From 2585ce678590b0df744f81849ab6842f1585a536 Mon Sep 17 00:00:00 2001
From: semio <semio@posteo.net>
Date: Sun, 19 Jan 2025 10:28:43 +0800
Subject: [PATCH] remove yival folder

---
 automation-api/yival_experiments/.gitignore   |   12 -
 automation-api/yival_experiments/README.md    |  103 --
 automation-api/yival_experiments/__init__.py  |    0
 .../custom_configuration/claude_evaluator.py  |  125 --
 .../claude_evaluator_config.py                |   18 -
 .../custom_configuration/evaluator_common.py  |   89 --
 .../example_evaluator_data.py                 |   50 -
 .../custom_configuration/gpt4_evaluator.py    |  126 --
 .../gpt4_evaluator_config.py                  |   18 -
 .../custom_configuration/llama3_evaluator.py  |  133 --
 .../llama3_evaluator_config.py                |   18 -
 .../llms/alibaba_complete.py                  |  155 ---
 .../llms/palm_completion.py                   |   18 -
 .../custom_configuration/model_compare.py     |  162 ---
 .../model_config_variation_generator.py       |   50 -
 ...model_config_variation_generator_config.py |    9 -
 .../model_config_wrapper.py                   |   34 -
 .../model_config_wrapper_config.py            |   12 -
 .../custom_configuration/question_reader.py   |   53 -
 .../question_reader_config.py                 |   13 -
 .../custom_configuration/simple_evaluator.py  |  171 ---
 .../simple_evaluator_config.py                |   14 -
 .../vertex_ai_evaluator.py                    |  141 ---
 .../vertex_ai_evaluator_config.py             |   18 -
 .../data/questions_en-US.csv                  |  366 ------
 .../data/questions_zh-CN.csv                  |  285 -----
 .../experiment_defaults.yaml                  |   40 -
 .../yival_experiments/experiment_example.yaml |  110 --
 .../notebooks/compare_evaluators.py           |   60 -
 .../notebooks/final_scores.py                 |  170 ---
 .../notebooks/human_rating.py                 |  141 ---
 .../notebooks/result_data_analysis.py         | 1089 -----------------
 .../notebooks/upload_to_ai_eval_sheet.py      |  288 -----
 .../1_number_of_average_answers.csv           |   35 -
 .../output/report_tables/2_average_rates.csv  |   35 -
 .../3_correct_rate_by_prompt.csv              |    4 -
 .../scripts/fetch_questions.py                |   72 --
 .../scripts/generate_experiment_config.py     |  194 ---
 .../scripts/generate_result.py                |  101 --
 .../experiment_20231104_cn.yaml               |    0
 .../experiment_20231104_en.yaml               |    0
 .../experiment_202401260846_en-US.yaml        |    0
 .../experiment_202401281713_zh-CN.yaml        |    0
 .../experiment_202401292237_en-US.yaml        |    0
 ..._202402011555_gemini_gemini-pro_en-US.yaml |    0
 ...202402011555_gpt-3-5-turbo-0613_en-US.yaml |    0
 ...202402011555_gpt-3-5-turbo-1106_en-US.yaml |    0
 ...202402011555_gpt-4-0125-preview_en-US.yaml |    0
 ...eriment_202402011555_gpt-4-0613_en-US.yaml |    0
 ...202402011555_gpt-4-1106-preview_en-US.yaml |    0
 ...nt_202402011555_palm_chat-bison_en-US.yaml |    0
 ...ef40397be9033abf9fd2badfe68c9e3_en-US.yaml |    0
 .../experiment_202402012248_zh-CN.yaml        |    0
 ...202402212350_gpt-4-0125-preview_en-US.yaml |    0
 ...402271117_gemini_gemini-1-0-pro_en-US.yaml |    0
 ...ment_202403061101_qwen-max-1201_zh-CN.yaml |    0
 ...202403291214_gpt-4-0125-preview_en-US.yaml |    0
 ...403291248_gemini_gemini-1-0-pro_en-US.yaml |    0
 ...403291536_gemini_gemini-1-0-pro_en-US.yaml |    0
 ...ment_202404011622_qwen-max-1201_zh-CN.yaml |    0
 ...202404051719_gpt-4-0125-preview_en-US.yaml |    0
 ...ment_202404102325_qwen-max-1201_zh-CN.yaml |    0
 ...201136_vertex_ai_gemini-1-5-pro_en-US.yaml |    0
 ..._ai_gemini-1-5-pro-preview-0409_en-US.yaml |    0
 ...ment_202405012311_qwen-max-0403_zh-CN.yaml |    0
 ..._ai_gemini-1-5-pro-preview-0409_en-US.yaml |    0
 ...ment_202405162244_qwen-max-0403_zh-CN.yaml |    0
 ...ment_202405162248_qwen-max-0403_zh-CN.yaml |    0
 ..._202405242125_gpt-4o-2024-05-13_en-US.yaml |    0
 ..._meta_meta-llama-3-70b-instruct_en-US.yaml |    0
 ...ertex_ai_claude-3-opus@20240229_en-US.yaml |    0
 ...ment_202406040141_qwen-max-0428_en-US.yaml |    0
 ..._202408291204_gpt-4o-2024-08-06_en-US.yaml |    0
 ...x_ai_claude-3-5-sonnet@20240620_en-US.yaml |    0
 ...models_llama-v3p1-405b-instruct_en-US.yaml |    0
 ...02409211350_qwen-max-2024-09-19_en-US.yaml |    0
 ...ment_202411221101_xai_grok-beta_en-US.yaml |    0
 ...models_llama-v3p1-405b-instruct_en-US.yaml |    0
 ..._202412052345_gpt-4o-2024-08-06_en-US.yaml |    0
 ...x_ai_claude-3-5-sonnet@20240620_en-US.yaml |    0
 ...02412060713_qwen-max-2024-09-19_en-US.yaml |    0
 ...47_vertex_ai_gemini-1-5-pro-002_en-US.yaml |    0
 ...ment_202412061914_xai_grok-beta_en-US.yaml |    0
 83 files changed, 4532 deletions(-)
 delete mode 100644 automation-api/yival_experiments/.gitignore
 delete mode 100644 automation-api/yival_experiments/README.md
 delete mode 100644 automation-api/yival_experiments/__init__.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/claude_evaluator.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/claude_evaluator_config.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/evaluator_common.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/example_evaluator_data.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/gpt4_evaluator.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/gpt4_evaluator_config.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/llama3_evaluator.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/llama3_evaluator_config.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/llms/alibaba_complete.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/llms/palm_completion.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/model_compare.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/model_config_variation_generator.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/model_config_variation_generator_config.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/model_config_wrapper.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/model_config_wrapper_config.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/question_reader.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/question_reader_config.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/simple_evaluator.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/simple_evaluator_config.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator.py
 delete mode 100644 automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator_config.py
 delete mode 100644 automation-api/yival_experiments/data/questions_en-US.csv
 delete mode 100644 automation-api/yival_experiments/data/questions_zh-CN.csv
 delete mode 100644 automation-api/yival_experiments/experiment_defaults.yaml
 delete mode 100644 automation-api/yival_experiments/experiment_example.yaml
 delete mode 100644 automation-api/yival_experiments/notebooks/compare_evaluators.py
 delete mode 100644 automation-api/yival_experiments/notebooks/final_scores.py
 delete mode 100644 automation-api/yival_experiments/notebooks/human_rating.py
 delete mode 100644 automation-api/yival_experiments/notebooks/result_data_analysis.py
 delete mode 100644 automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py
 delete mode 100644 automation-api/yival_experiments/output/report_tables/1_number_of_average_answers.csv
 delete mode 100644 automation-api/yival_experiments/output/report_tables/2_average_rates.csv
 delete mode 100644 automation-api/yival_experiments/output/report_tables/3_correct_rate_by_prompt.csv
 delete mode 100644 automation-api/yival_experiments/scripts/fetch_questions.py
 delete mode 100644 automation-api/yival_experiments/scripts/generate_experiment_config.py
 delete mode 100644 automation-api/yival_experiments/scripts/generate_result.py
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_20231104_cn.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_20231104_en.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202401260846_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202401281713_zh-CN.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202401292237_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202402011555_gemini_gemini-pro_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202402011555_gpt-3-5-turbo-0613_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202402011555_gpt-3-5-turbo-1106_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202402011555_gpt-4-0125-preview_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202402011555_gpt-4-0613_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202402011555_gpt-4-1106-preview_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202402011555_palm_chat-bison_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202402011555_replicate_llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202402012248_zh-CN.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202402212350_gpt-4-0125-preview_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202402271117_gemini_gemini-1-0-pro_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202403061101_qwen-max-1201_zh-CN.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202403291214_gpt-4-0125-preview_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202403291248_gemini_gemini-1-0-pro_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202403291536_gemini_gemini-1-0-pro_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202404011622_qwen-max-1201_zh-CN.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202404051719_gpt-4-0125-preview_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202404102325_qwen-max-1201_zh-CN.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202404201136_vertex_ai_gemini-1-5-pro_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202404201344_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202405012311_qwen-max-0403_zh-CN.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202405162215_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202405162244_qwen-max-0403_zh-CN.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202405162248_qwen-max-0403_zh-CN.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202405242125_gpt-4o-2024-05-13_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202405281300_replicate_meta_meta-llama-3-70b-instruct_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202405291053_vertex_ai_claude-3-opus@20240229_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202406040141_qwen-max-0428_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202408291204_gpt-4o-2024-08-06_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202408310828_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202409102304_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202409211350_qwen-max-2024-09-19_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202411221101_xai_grok-beta_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202412052345_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202412052345_gpt-4o-2024-08-06_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202412052345_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202412060713_qwen-max-2024-09-19_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202412061047_vertex_ai_gemini-1-5-pro-002_en-US.yaml (100%)
 rename {automation-api/yival_experiments => experiments/yival_experiment_archives}/experiment_configurations/experiment_202412061914_xai_grok-beta_en-US.yaml (100%)

diff --git a/automation-api/yival_experiments/.gitignore b/automation-api/yival_experiments/.gitignore
deleted file mode 100644
index 64b001b..0000000
--- a/automation-api/yival_experiments/.gitignore
+++ /dev/null
@@ -1,12 +0,0 @@
-*.pyc
-*.egg-info
-*.bak
-.coverage
-.tox
-.venv
-test.db
-.ipynb_checkpoints
-dist
-.env
-*.ipynb
-requirements.txt
diff --git a/automation-api/yival_experiments/README.md b/automation-api/yival_experiments/README.md
deleted file mode 100644
index 0f04d37..0000000
--- a/automation-api/yival_experiments/README.md
+++ /dev/null
@@ -1,103 +0,0 @@
-# How to run experiment
-
-## 1. Install automation-api library dependencies
-
-``` shell
-cd /path/to/gapminder-ai/automation-api
-poetry install
-```
-
-## 2. Enable poetry shell
-
-``` shell
-poetry shell
-```
-
-## 3. Fetch questions
-
-``` shell
-poe fetch_questions
-```
-
-This will fetch all enabled questions in the AI eval spreadsheet and create data/questions_{language}.csv files, one per language.
-
-Note: Yival requires a dataset to have a local file when the source type is set to "dataset". So we need to fetch it first.
-
-## 4. Generate experiment config
-
-To generate experiment configuration based on the current settings in the AI Eval Spreadsheet:
-
-``` shell
-poe generate_experiment_config
-```
-
-This generates one experiment configuration per language and stores them in `./yival_experiments/experiment_configurations/`.
-
-## 5. Start Redis for caching
-
-The model compare function will cache LLM call results for the
-evaluator, and by default the cache is dictionary in memory.
-Redis is used by default for caching, so that it won't lose the cache when Yival
-exits. start a local redis server:
-
-``` shell
-poe start_redis
-```
-
-Note: To not use Redis, comment the line for redis cache in the top
-of `custom_configuration/model_compare.py` and
-
-## 6. Run an experiment
-
-To run a particular experiment configuration (in `./yival_experiments/experiment_configurations/`):
-
-``` shell
-poe run_experiment --experiment=experiment_name
-```
-
-This will use the configuration experiment_name.yaml in `./yival_experiments/experiment_configurations/`
-and output a pickle file in `./yival_experiments/output/experiment_name_en-US_0.pkl` which includes all Experiment Results objects.
-
-When the experiment is completed, Yival will start a web server to show the results.
-
-### Setup environment variables
-Here are a list of environment variables that needed to be set (in automation-api/.env file) before running experiments, depending on the model we are testing:
-
-- OpenAI models: OPENAI_API_KEY and OPENAI_ORG_ID
-- Hugging Face models: HUGGINGFACEHUB_API_TOKEN
-- Replicate models: REPLICATE_API_KEY
-- Alibaba models: DASHSCOPE_API_KEY
-- Google Gemini API: GEMINI_API_KEY
-- VertexAI models: VERTEX_SERVICE_ACCOUNT_CREDENTIALS, VERTEXAI_PROJECT and VERTEXAI_LOCATIONS
-
-Some notes on VertexAI:
-
-- VERTEXAI_LOCATIONS can be a comma separated list of gcp regions, to get over the limit of 5 requests per minute of Gemini
-- follow the instruction in [DEV.md](https://github.com/Gapminder/gapminder-ai/blob/main/automation-api/DEV.md#obtaining-developer-specific-service-account-credentials-base64-encoded) to obtain VERTEX_SERVICE_ACCOUNT_CREDENTIALS
-
-## 7. Generate a result xlsx from output
-
-To convert the pickle files to Excel file:
-
-``` shell
-poe generate_result
-```
-
-This will read all pickles in output/ directory and will generate `results.xlsx` in output/ directory.
-
-TODO: We can add a custom evaluator in Yival to calculate the final scores.
-
-## 8. Calculate scores, upload results to AI Eval Spreadsheet
-
-Two notebooks in `./yival_experiments/notebooks/` directory are provided for calculating scores.
-
-- upload_to_ai_eval_sheet.py: generate the result table and upload to the `Latest Results` sheet in AI Eval Spreadsheet
-- result_data_analysis.py: compute statistics from the results
-
-Start Jupyter:
-
-```shell
-poe notebooks
-```
-
-Then open the notebooks in the browser and run them.
diff --git a/automation-api/yival_experiments/__init__.py b/automation-api/yival_experiments/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/automation-api/yival_experiments/custom_configuration/claude_evaluator.py b/automation-api/yival_experiments/custom_configuration/claude_evaluator.py
deleted file mode 100644
index a191ea6..0000000
--- a/automation-api/yival_experiments/custom_configuration/claude_evaluator.py
+++ /dev/null
@@ -1,125 +0,0 @@
-"""
-ClaudeEvaluator is an evaluator that uses Anthropic's Claude model for evaluations.
-
-The evaluator interfaces with Claude via litellm to present tasks and interpret
-the model's responses to determine the quality or correctness of a given
-experiment result.
-"""
-import copy
-import logging
-
-import litellm
-from claude_evaluator_config import ClaudeEvaluatorConfig
-from evaluator_common import (
-    CLASSIFY_STR,
-    calculate_choice_score,
-    choices_to_string,
-    completion_with_backpff,
-    extract_choice_from_response,
-    format_template,
-)
-from yival.evaluators.base_evaluator import BaseEvaluator
-from yival.schemas.evaluator_config import (
-    EvaluatorOutput,
-    EvaluatorType,
-    MethodCalculationMethod,
-    MetricCalculatorConfig,
-)
-from yival.schemas.experiment_config import (
-    ExperimentResult,
-    InputData,
-    MultimodalOutput,
-)
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class ClaudeEvaluator(BaseEvaluator):
-    """Evaluator using Claude for evaluation."""
-
-    default_config = ClaudeEvaluatorConfig(name="claude_evaluator")  # type: ignore
-
-    def __init__(self, config: ClaudeEvaluatorConfig):
-        super().__init__(config)
-        self.config = config
-
-    def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
-        """Evaluate the experiment result using Claude."""
-        format_dict = copy.deepcopy(experiment_result.input_data.content)
-        format_dict["raw_output"] = experiment_result.raw_output.text_output
-
-        prompt = format_template(self.config.prompt, format_dict)
-        if isinstance(prompt, str):
-            prompt = [{"role": "user", "content": prompt}]
-
-        prompt[-1]["content"] += "\n\n" + CLASSIFY_STR.format(
-            choices=choices_to_string(self.config.choices)
-        )
-        response = completion_with_backpff(
-            model=self.config.model_name,
-            messages=prompt,
-            temperature=0.0,
-            n=1,
-            max_tokens=2000,
-            request_timeout=60,
-            caching=True,
-        )
-        response_content = response["choices"][0]["message"]["content"]
-        choice = extract_choice_from_response(response_content, self.config.choices)
-        score = calculate_choice_score(choice, self.config.choice_scores)
-        return EvaluatorOutput(
-            name=self.config.name,
-            result=score if score is not None else choice,
-            display_name=self.config.display_name,
-            metric_calculators=self.config.metric_calculators,
-        )
-
-
-BaseEvaluator.register_evaluator(
-    "claude_evaluator", ClaudeEvaluator, ClaudeEvaluatorConfig
-)
-
-
-def main():
-    """Main function to test the ClaudeEvaluator."""
-    from example_evaluator_data import (
-        choice_scores,
-        choices,
-        content,
-        prompt,
-        raw_output,
-    )
-
-    litellm.set_verbose = True
-
-    evaluator_config = ClaudeEvaluatorConfig(
-        name="claude_evaluator",
-        display_name="correctness test",
-        metric_calculators=[
-            MetricCalculatorConfig(
-                MethodCalculationMethod(MethodCalculationMethod.AVERAGE)
-            )
-        ],
-        prompt=prompt,
-        choices=choices,
-        evaluator_type=EvaluatorType.INDIVIDUAL,
-        choice_scores=choice_scores,
-    )
-    input_data_example = InputData(content=content)
-
-    experiment_result_example = ExperimentResult(
-        input_data=input_data_example,
-        combination={"wrapper1": "var1", "wrapper2": "var2"},
-        raw_output=MultimodalOutput(text_output=raw_output),
-        latency=150.0,
-        token_usage=50,
-    )
-
-    evaluator = ClaudeEvaluator(evaluator_config)
-    result = evaluator.evaluate(experiment_result_example)
-    print("Result: ", result.result)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/automation-api/yival_experiments/custom_configuration/claude_evaluator_config.py b/automation-api/yival_experiments/custom_configuration/claude_evaluator_config.py
deleted file mode 100644
index 4290163..0000000
--- a/automation-api/yival_experiments/custom_configuration/claude_evaluator_config.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from dataclasses import asdict, dataclass, field
-from typing import Any, Dict, List, Optional, Union
-
-from yival.schemas.evaluator_config import EvaluatorConfig, EvaluatorType
-
-
-@dataclass
-class ClaudeEvaluatorConfig(EvaluatorConfig):
-    evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
-    prompt: Union[str, List[Dict[str, str]]] = ""
-    choices: List[str] = field(default_factory=list)
-    model_name: str = "claude-3-5-sonnet-20241022"
-    description: str = "This is an evaluator that uses Anthropic's Claude model."
-    scale_description: str = "0-4"
-    choice_scores: Optional[Dict[str, float]] = None
-
-    def asdict(self) -> Dict[str, Any]:
-        return asdict(self)
diff --git a/automation-api/yival_experiments/custom_configuration/evaluator_common.py b/automation-api/yival_experiments/custom_configuration/evaluator_common.py
deleted file mode 100644
index 0370879..0000000
--- a/automation-api/yival_experiments/custom_configuration/evaluator_common.py
+++ /dev/null
@@ -1,89 +0,0 @@
-"""common functions for the evaluators"""
-
-import copy
-import logging
-import string
-from typing import Any, Dict, Iterable, List, Optional, Union
-
-import litellm
-from tenacity import before_sleep_log, retry, stop_after_attempt, wait_random
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-CLASSIFY_STR = """
-First, write out in a step by step manner your reasoning to be sure that your conclusion is correct.
-Avoid simply stating the correct answer at the outset.
-Then print only a single choice from {choices} (including letter and option text) on its own line corresponding to the correct answer of your task.
-At the end, repeat just the choice (one letter, without quotes or punctuation) by itself on a new line.
-Reasoning:
-"""
-
-MATCH_FNS = {
-    "include": lambda x, y: float(x in y),
-    "exact": lambda x, y: float(x == y),
-    "endswith": lambda x, y: x.endswith(y),
-    "starts_or_endswith": lambda x, y: x.startswith(y) or x.endswith(y),
-}
-
-
-def extract_choice_from_response(response: str, choice_strings: Iterable[str]) -> str:
-    """Extracts the choice from the response string."""
-    lines = response.strip().split("\n")
-    for line in lines[::-1]:
-        sanitized_line = "".join(c for c in line if c not in string.punctuation).strip()
-        if not sanitized_line:
-            continue
-        for choice in choice_strings:
-            if MATCH_FNS["exact"](sanitized_line, choice):
-                return choice
-    return "invalid response"
-
-
-def calculate_choice_score(
-    choice: str, choice_scores: Optional[Dict[str, float]] = None
-) -> Optional[float]:
-    """Calculates the score for the given choice."""
-    if choice_scores is None:
-        return None
-    if choice == "invalid response":
-        return min(choice_scores.values())
-    return choice_scores.get(choice)
-
-
-def format_template(
-    template: Union[str, List[Dict[str, str]]], content: Dict[str, Any]
-) -> Union[str, List[Dict[str, str]]]:
-    """Formats a string or list template with the provided content."""
-    if isinstance(template, str):
-        try:
-            return template.format(**content)
-        except KeyError as e:
-            raise ValueError(f"Missing key {e} in content dictionary")
-
-    res = []
-    for t in template:
-        formatted_msg = copy.deepcopy(t)
-        try:
-            if "content" in formatted_msg:
-                formatted_msg["content"] = formatted_msg["content"].format(**content)
-        except KeyError as e:
-            raise ValueError(f"Missing key {e} in content dictionary")
-        res.append(formatted_msg)
-    return res
-
-
-@retry(
-    wait=wait_random(min=1, max=20),
-    stop=stop_after_attempt(100),
-    before_sleep=before_sleep_log(logger, logging.DEBUG),
-)
-def completion_with_backpff(**kwargs):
-    # response = openai.ChatCompletion.create(**kwargs)
-    response = litellm.completion(**kwargs)
-    return response
-
-
-def choices_to_string(choice_strings: Iterable[str]) -> str:
-    """Converts a list of choices into a formatted string."""
-    return " or ".join(f'"{choice}"' for choice in choice_strings)
diff --git a/automation-api/yival_experiments/custom_configuration/example_evaluator_data.py b/automation-api/yival_experiments/custom_configuration/example_evaluator_data.py
deleted file mode 100644
index 3255e12..0000000
--- a/automation-api/yival_experiments/custom_configuration/example_evaluator_data.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# example data for testing evaluators. the main function in gpt4_evaluator.py and other evaluators require this file
-
-import pandas as pd
-
-example_question_id = "1737"
-example_question_lang = "en"  # 'cn' or 'en'
-
-raw_output = """Understanding public perception of climate change is crucial for railway workers because it can influence policy decisions, funding, and the future direction of infrastructure projects. If a significant portion of the population views climate change as a threat, there may be increased investment in sustainable and resilient railway systems. This could lead to job security, new training opportunities, and the adoption of innovative technologies within the railway industry.
-
-The correct answer is:
-C. Around 80%
-
-Simple phrasing for railway workers:
-\"About 80% of people in high-income countries see climate change as a threat to their country over the next 20 years.\""""
-
-en_question_file = "../data/questions_en-US.csv"
-cn_question_file = "../data/questions_zh-CN.csv"
-
-prompt = """You are assessing a submitted answer on a given task based on a criterion. Here is the data:
-- Task: Answer a question.
-- Is the answer correct?
-[Input]: {question_text}
-Possible Options and Correctness of Options:
-A. {option_a} ({option_a_correctness})
-B. {option_b} ({option_b_correctness})
-C. {option_c} ({option_c_correctness})
-[Result]: {raw_output}
-Assess the answer by selecting one of the following options. Please note that the question was asked in some context so the answer would contain some more contextual information. You should conclude which option this answer has chosen.
-A The answer looks like some kind of exception / error message; or it's an equivocal answer; or it doesn't answer to the question at all.
-B The answer is not correct, and matches/is close to the Very Wrong answer.
-C The answer is not correct, and matches/is close to the Wrong answer.
-D The answer is correct, i.e. matches the Correct answer."""
-
-choices = ["A", "B", "C", "D"]
-
-choice_scores = dict(zip(choices, [0, 1, 2, 3]))
-
-
-def get_example_input(question_id, lang):
-    if lang == "en":
-        df = pd.read_csv(en_question_file, dtype="str")
-    else:
-        df = pd.read_csv(cn_question_file, dtype="str")
-
-    question_data = df.loc[df["question_id"] == question_id]
-    return question_data.iloc[0].to_dict()
-
-
-# print(get_example_input(example_question_id, example_question_lang))
-content = get_example_input(example_question_id, example_question_lang)
diff --git a/automation-api/yival_experiments/custom_configuration/gpt4_evaluator.py b/automation-api/yival_experiments/custom_configuration/gpt4_evaluator.py
deleted file mode 100644
index 329c270..0000000
--- a/automation-api/yival_experiments/custom_configuration/gpt4_evaluator.py
+++ /dev/null
@@ -1,126 +0,0 @@
-"""
-OpenAIPromptBasedEvaluator is an evaluator that uses OpenAI's prompt-based
-system for evaluations.
-
-The evaluator interfaces with the OpenAI API to present tasks and interpret
-the model's responses to determine the quality or correctness of a given
-experiment result.
-"""
-import copy
-import logging
-
-import litellm
-from evaluator_common import (
-    CLASSIFY_STR,
-    calculate_choice_score,
-    choices_to_string,
-    completion_with_backpff,
-    extract_choice_from_response,
-    format_template,
-)
-from gpt4_evaluator_config import GPT4EvaluatorConfig
-from yival.evaluators.base_evaluator import BaseEvaluator
-from yival.schemas.evaluator_config import (
-    EvaluatorOutput,
-    EvaluatorType,
-    MethodCalculationMethod,
-    MetricCalculatorConfig,
-)
-from yival.schemas.experiment_config import (
-    ExperimentResult,
-    InputData,
-    MultimodalOutput,
-)
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class GPT4Evaluator(BaseEvaluator):
-    """Evaluator using OpenAI's prompt-based evaluation."""
-
-    default_config = GPT4EvaluatorConfig(name="gpt4_evaluator")  # type: ignore
-
-    def __init__(self, config: GPT4EvaluatorConfig):
-        super().__init__(config)
-        self.config = config
-
-    def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
-        """Evaluate the experiment result using OpenAI's prompt-based evaluation."""
-        format_dict = copy.deepcopy(experiment_result.input_data.content)
-        format_dict["raw_output"] = experiment_result.raw_output.text_output
-
-        prompt = format_template(self.config.prompt, format_dict)
-        if isinstance(prompt, str):
-            prompt = [{"role": "user", "content": prompt}]
-
-        prompt[-1]["content"] += "\n\n" + CLASSIFY_STR.format(
-            choices=choices_to_string(self.config.choices)
-        )
-        response = completion_with_backpff(
-            model=self.config.model_name,
-            messages=prompt,
-            temperature=0.0,
-            n=1,
-            max_tokens=2000,
-            request_timeout=60,
-            caching=True,
-        )
-        # response = openai.ChatCompletion.create(
-        #     model="gpt-4", messages=prompt, temperature=0.5)
-        response_content = response["choices"][0]["message"]["content"]
-        choice = extract_choice_from_response(response_content, self.config.choices)
-        score = calculate_choice_score(choice, self.config.choice_scores)
-        return EvaluatorOutput(
-            name=self.config.name,
-            result=score if score is not None else choice,
-            display_name=self.config.display_name,
-            metric_calculators=self.config.metric_calculators,
-        )
-
-
-BaseEvaluator.register_evaluator("gpt4_evaluator", GPT4Evaluator, GPT4EvaluatorConfig)
-
-
-def main():
-    """Main function to test the OpenAIPromptBasedEvaluator."""
-    from example_evaluator_data import (
-        choice_scores,
-        choices,
-        content,
-        prompt,
-        raw_output,
-    )
-
-    litellm.set_verbose = True
-
-    evaluator_config = GPT4EvaluatorConfig(
-        name="gpt4_evaluator",
-        display_name="correctness test",
-        metric_calculators=[
-            MetricCalculatorConfig(
-                MethodCalculationMethod(MethodCalculationMethod.AVERAGE)
-            )
-        ],
-        prompt=prompt,
-        choices=choices,
-        evaluator_type=EvaluatorType.INDIVIDUAL,
-        choice_scores=choice_scores,
-    )
-    input_data_example = InputData(content=content)
-
-    experiment_result_example = ExperimentResult(
-        input_data=input_data_example,
-        combination={"wrapper1": "var1", "wrapper2": "var2"},
-        raw_output=MultimodalOutput(text_output=raw_output),
-        latency=150.0,
-        token_usage=50,
-    )
-
-    evaluator = GPT4Evaluator(evaluator_config)
-    result = evaluator.evaluate(experiment_result_example)
-    print("Result: ", result.result)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/automation-api/yival_experiments/custom_configuration/gpt4_evaluator_config.py b/automation-api/yival_experiments/custom_configuration/gpt4_evaluator_config.py
deleted file mode 100644
index aa9e0af..0000000
--- a/automation-api/yival_experiments/custom_configuration/gpt4_evaluator_config.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from dataclasses import asdict, dataclass, field
-from typing import Any, Dict, List, Optional, Union
-
-from yival.schemas.evaluator_config import EvaluatorConfig, EvaluatorType
-
-
-@dataclass
-class GPT4EvaluatorConfig(EvaluatorConfig):
-    evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
-    prompt: Union[str, List[Dict[str, str]]] = ""
-    choices: List[str] = field(default_factory=list)
-    model_name: str = "gpt-4o"
-    description: str = "This is the description of the evaluator."
-    scale_description: str = "0-4"
-    choice_scores: Optional[Dict[str, float]] = None
-
-    def asdict(self) -> Dict[str, Any]:
-        return asdict(self)
diff --git a/automation-api/yival_experiments/custom_configuration/llama3_evaluator.py b/automation-api/yival_experiments/custom_configuration/llama3_evaluator.py
deleted file mode 100644
index ff82d2b..0000000
--- a/automation-api/yival_experiments/custom_configuration/llama3_evaluator.py
+++ /dev/null
@@ -1,133 +0,0 @@
-"""
-OpenAIPromptBasedEvaluator is an evaluator that uses OpenAI's prompt-based
-system for evaluations.
-
-The evaluator interfaces with the OpenAI API to present tasks and interpret
-the model's responses to determine the quality or correctness of a given
-experiment result.
-"""
-import copy
-import logging
-
-import litellm
-from evaluator_common import (
-    CLASSIFY_STR,
-    calculate_choice_score,
-    choices_to_string,
-    completion_with_backpff,
-    extract_choice_from_response,
-    format_template,
-)
-from llama3_evaluator_config import Llama3EvaluatorConfig
-from yival.evaluators.base_evaluator import BaseEvaluator
-from yival.schemas.evaluator_config import (
-    EvaluatorOutput,
-    EvaluatorType,
-    MethodCalculationMethod,
-    MetricCalculatorConfig,
-)
-from yival.schemas.experiment_config import (
-    ExperimentResult,
-    InputData,
-    MultimodalOutput,
-)
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class Llama3Evaluator(BaseEvaluator):
-    """Evaluator using OpenAI's prompt-based evaluation."""
-
-    default_config = Llama3EvaluatorConfig(name="llama3_evaluator")  # type: ignore
-
-    def __init__(self, config: Llama3EvaluatorConfig):
-        super().__init__(config)
-        self.config = config
-
-    def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
-        """Evaluate the experiment result using OpenAI's prompt-based evaluation."""
-        assert isinstance(self.config, Llama3EvaluatorConfig)
-        format_dict = copy.deepcopy(experiment_result.input_data.content)
-        format_dict["raw_output"] = experiment_result.raw_output.text_output
-
-        prompt = format_template(self.config.prompt, format_dict)
-        if isinstance(prompt, str):
-            prompt = [{"role": "user", "content": prompt}]
-
-        prompt[-1]["content"] += "\n\n" + CLASSIFY_STR.format(
-            choices=choices_to_string(self.config.choices)
-        )
-        response = completion_with_backpff(
-            model=self.config.model_name,
-            messages=prompt,
-            temperature=0.0,
-            n=1,
-            max_tokens=2000,
-            request_timeout=60,
-            caching=True,
-        )
-        # response = openai.ChatCompletion.create(
-        #     model="gpt-4", messages=prompt, temperature=0.5)
-        response_content = response["choices"][0]["message"]["content"]
-        choice = extract_choice_from_response(response_content, self.config.choices)
-        score = calculate_choice_score(choice, self.config.choice_scores)
-        return EvaluatorOutput(
-            name=self.config.name,
-            result=score if score is not None else choice,
-            display_name=self.config.display_name,
-            metric_calculators=self.config.metric_calculators,
-        )
-
-
-BaseEvaluator.register_evaluator(
-    "llama3_evaluator", Llama3Evaluator, Llama3EvaluatorConfig
-)
-
-
-def main():
-    """Main function to test the OpenAIPromptBasedEvaluator."""
-    from example_evaluator_data import (
-        choice_scores,
-        choices,
-        content,
-        prompt,
-        raw_output,
-    )
-
-    from lib.config import read_config
-
-    read_config()
-
-    litellm.set_verbose = True
-
-    evaluator_config = Llama3EvaluatorConfig(
-        name="llama3_evaluator",
-        display_name="correctness test",
-        metric_calculators=[
-            MetricCalculatorConfig(
-                MethodCalculationMethod(MethodCalculationMethod.AVERAGE)
-            )
-        ],
-        prompt=prompt,
-        choices=choices,
-        evaluator_type=EvaluatorType.INDIVIDUAL,
-        choice_scores=choice_scores,
-    )
-    input_data_example = InputData(content=content)
-
-    experiment_result_example = ExperimentResult(
-        input_data=input_data_example,
-        combination={"wrapper1": "var1", "wrapper2": "var2"},
-        raw_output=MultimodalOutput(text_output=raw_output),
-        latency=150.0,
-        token_usage=50,
-    )
-
-    evaluator = Llama3Evaluator(evaluator_config)
-    result = evaluator.evaluate(experiment_result_example)
-    print("Result: ", result.result)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/automation-api/yival_experiments/custom_configuration/llama3_evaluator_config.py b/automation-api/yival_experiments/custom_configuration/llama3_evaluator_config.py
deleted file mode 100644
index 5a73392..0000000
--- a/automation-api/yival_experiments/custom_configuration/llama3_evaluator_config.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from dataclasses import asdict, dataclass, field
-from typing import Any, Dict, List, Optional, Union
-
-from yival.schemas.evaluator_config import EvaluatorConfig, EvaluatorType
-
-
-@dataclass
-class Llama3EvaluatorConfig(EvaluatorConfig):
-    evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
-    prompt: Union[str, List[Dict[str, str]]] = ""
-    choices: List[str] = field(default_factory=list)
-    model_name: str = "fireworks_ai/accounts/fireworks/models/llama-v3p1-405b-instruct"
-    description: str = "This is the description of the evaluator."
-    scale_description: str = "0-4"
-    choice_scores: Optional[Dict[str, float]] = None
-
-    def asdict(self) -> Dict[str, Any]:
-        return asdict(self)
diff --git a/automation-api/yival_experiments/custom_configuration/llms/alibaba_complete.py b/automation-api/yival_experiments/custom_configuration/llms/alibaba_complete.py
deleted file mode 100644
index 63c27f8..0000000
--- a/automation-api/yival_experiments/custom_configuration/llms/alibaba_complete.py
+++ /dev/null
@@ -1,155 +0,0 @@
-# encoding: utf-8
-
-import random
-from http import HTTPStatus
-
-import dashscope
-from dashscope import Generation
-from tenacity import (
-    retry,
-    retry_if_exception_type,
-    retry_if_not_result,
-    stop_after_attempt,
-    wait_random_exponential,
-)
-
-from lib.config import read_config
-
-
-def response_is_ok(response):
-    if response.status_code == HTTPStatus.OK:
-        return True
-    return False
-
-
-def return_last_message(retry_state):
-    last_val = retry_state.outcome.result()
-    result = {"output": {"text": f"Error: {last_val.code}: {last_val.message}"}}
-    return result
-
-
-@retry(
-    retry=(retry_if_exception_type() | retry_if_not_result(response_is_ok)),
-    stop=stop_after_attempt(10),
-    wait=wait_random_exponential(multiplier=1, min=5, max=80),
-    retry_error_callback=return_last_message,
-)
-def get_reply(**kwargs):
-    return Generation.call(**kwargs)
-
-
-""" Here we need to convert the reply from alibaba into openai's output format.
-
-Alibaba:
-```
-{
-    "status_code": 200,
-    "request_id": "05dc83af-7185-9e14-9b0b-4466de159d6a",
-    "code": "",
-    "message": "",
-    "output": {
-        "text": null
-        "finish_reason": null
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "message": {
-              "role": "assistant",
-              "content": "对于有编程基础的人，..."
-            }
-          }
-        ],
-    },
-    "usage": {
-        "input_tokens": 12,
-        "output_tokens": 98,
-        "total_tokens": 110
-    }
-}
-```
-
-openai:
-```
-{
-  "id": "chatcmpl-123",
-  "object": "chat.completion",
-  "created": 1677652288,
-  "model": "gpt-3.5-turbo-0613",
-  "choices": [{
-    "index": 0,
-    "message": {
-      "role": "assistant",
-      "content": "\n\nHello there, how may I assist you today?",
-    },
-    "finish_reason": "stop"
-  }],
-  "usage": {
-    "prompt_tokens": 9,
-    "completion_tokens": 12,
-    "total_tokens": 21
-  }
-}
-```
-
-The only thing need to translate is the `output` key in alibaba
-"""
-
-
-def llm_complete(
-    model_name,
-    prompt,
-    temperature=1.0,
-    top_p=0.8,
-    top_k=None,
-    enable_search=False,
-    dashscope_api_key=None,
-):
-    seed = random.randint(0, 2**63)
-    if not dashscope_api_key:
-        config = read_config()
-        if "DASHSCOPE_API_KEY" in config.keys():
-            dashscope_api_key = config["DASHSCOPE_API_KEY"]
-        else:
-            raise ValueError(
-                "please set DASHSCHPE_API_KEY in .env or \
-                provide the dashschpe_api_key parameter."
-            )
-    dashscope.api_key = dashscope_api_key
-
-    reply = get_reply(
-        model=model_name,
-        prompt=prompt,  # alternativly we can use the `messages` parameter. see doc.
-        top_p=top_p,
-        top_k=top_k,
-        seed=seed,
-        enable_search=enable_search,
-        result_format="message",
-    )
-    # fixing the format
-    output = reply.pop("output", None)
-    if (
-        output["text"] is not None
-    ):  # API reported an error. see return_last_message() above.
-        # let's just return the error message.
-        reply["choices"] = [
-            {
-                "finish_reason": "stop",
-                "message": {"role": "assistant", "content": output["text"]},
-            }
-        ]
-    else:
-        reply["choices"] = output["choices"]
-    return reply
-
-
-if __name__ == "__main__":
-    # NOTE: currently there is qwen-turbo and qwen-plus.
-    # qwen-plus is stronger than qwen-turbo
-    q = """请回答以下的选择题，如果你不确定也要根据你知道的信息选择一个答案。\n\
-    问题：全世界有多少大学生在自己的本国（而不是国外）获得学位？\n\
-    A. 大约 77%\n\
-    B. 大约 87%\n\
-    C. 大约 97%\n\
-
-    回答："""
-    print(llm_complete("qwen-max-1201", q, temperature=0.01, top_p=0.2, top_k=100))
diff --git a/automation-api/yival_experiments/custom_configuration/llms/palm_completion.py b/automation-api/yival_experiments/custom_configuration/llms/palm_completion.py
deleted file mode 100644
index 9ae82ec..0000000
--- a/automation-api/yival_experiments/custom_configuration/llms/palm_completion.py
+++ /dev/null
@@ -1,18 +0,0 @@
-safety_settings = [
-    {
-        "category": "HARM_CATEGORY_HARASSMENT",
-        "threshold": "BLOCK_ONLY_HIGH",
-    },
-    {
-        "category": "HARM_CATEGORY_HATE_SPEECH",
-        "threshold": "BLOCK_ONLY_HIGH",
-    },
-    {
-        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
-        "threshold": "BLOCK_ONLY_HIGH",
-    },
-    {
-        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
-        "threshold": "BLOCK_ONLY_HIGH",
-    },
-]
diff --git a/automation-api/yival_experiments/custom_configuration/model_compare.py b/automation-api/yival_experiments/custom_configuration/model_compare.py
deleted file mode 100644
index bda2611..0000000
--- a/automation-api/yival_experiments/custom_configuration/model_compare.py
+++ /dev/null
@@ -1,162 +0,0 @@
-import os
-import random
-
-import litellm
-from litellm import completion
-from model_config_wrapper import ModelConfigWrapper
-from yival.logger.token_logger import TokenLogger
-from yival.schemas.experiment_config import MultimodalOutput
-from yival.schemas.model_configs import Response
-from yival.states.experiment_state import ExperimentState
-from yival.wrappers.string_wrapper import StringWrapper
-
-# load env vars
-from lib.config import read_config
-from yival_experiments.custom_configuration.llms.palm_completion import safety_settings
-
-read_config()
-
-# default model config if not provided
-# default_model_config = dict(
-#     model_id="gpt-4o-2024-05-13",
-#     params={"temperature": 0.5},
-#     vendor="OpenAI"
-# )
-# default_model_config = dict(
-#     model_id="vertex_ai/gemini-pro-experimental",
-#     params={"temperature": 0.5},
-#     vendor="Google",
-# )
-# default_model_config = dict(
-#     model_id="vertex_ai/claude-3-opus@20240229",
-#     params={"temperature": 0.5},
-#     vendor="Anthropic",
-# )
-# default_model_config = dict(
-#     model_id="replicate/meta/meta-llama-3-70b-instruct",
-#     params={"temperature": 0.5},
-#     vendor="Meta",
-# )
-default_model_config = dict(
-    model_id="qwen-max", params={"temperature": 0.5}, vendor="Alibaba"
-)
-# set this to see verbose outputs
-litellm.set_verbose = True
-# enable caching in the evaluator.
-# litellm.cache = litellm.Cache()
-# to not use Redis for caching: uncomment the line above and comment the line below.
-litellm.cache = litellm.Cache(
-    type="redis", host="127.0.0.1", port=26379, ttl=60 * 24 * 3600
-)
-
-
-def model_compare(
-    question_id: str,
-    question_text: str,
-    language: str,
-    option_a: str,
-    option_a_correctness: str,
-    option_b: str,
-    option_b_correctness: str,
-    option_c: str,
-    option_c_correctness: str,
-    state: ExperimentState,
-) -> MultimodalOutput:
-    logger = TokenLogger()
-    logger.reset()
-
-    model = ModelConfigWrapper(
-        default_model_config, name="model_config", state=state
-    ).get_value()
-
-    prompt_template_default = """Answer following multiple choices question:
-    Question: {question_text}
-    A. {option_a}
-    B. {option_b}
-    C. {option_c}
-
-    Answer:"""
-    # TODO: there might be better way to handle variables in prompt variations.
-    prompt_template = str(StringWrapper("", name="prompt_template", state=state))
-    if prompt_template == "":
-        prompt_template = prompt_template_default
-
-    prompt = prompt_template.format(
-        question_text=question_text,
-        option_a=option_a,
-        option_b=option_b,
-        option_c=option_c,
-    )
-    # system_prompt = """..."""
-
-    # prepare model call parameters
-    litellm_messages = [
-        # {"content": system_prompt, "role": "system"},
-        {"content": prompt, "role": "user"}
-    ]
-
-    litellm_params = dict(
-        model=model["model_id"],
-        messages=litellm_messages,
-        caching=True,
-        num_retries=10,
-        request_timeout=60,
-        **model["params"],
-    )
-    if model["vendor"] == "Google":
-        # choose a vertex project location
-        litellm.vertex_location = random.choice(
-            os.environ["VERTEXAI_LOCATIONS"].split(",")
-        )
-        # google allows changing content filters. We will disable all
-        litellm_params["safety_settings"] = safety_settings
-    elif model["vendor"] == "Anthropic":
-        # all Anthropic models are abailable in us-east5
-        litellm.vertex_location = "us-east5"
-    elif model["vendor"] == "Alibaba":
-        # Alibaba has openai compatible endpoints
-        litellm_params["model"] = f"openai/{litellm_params['model']}"
-        litellm_params["api_key"] = os.getenv("DASHSCOPE_API_KEY")
-        litellm_params["api_base"] = "https://dashscope.aliyuncs.com/compatible-mode/v1"
-    try:
-        response = Response(output=completion(**litellm_params)).output
-        response_text = response["choices"][0]["message"]["content"]
-    except KeyboardInterrupt:
-        raise
-    except Exception as e:
-        print(str(e))
-        response = None
-        response_text = "No Answer. Reason:\n" + str(e)
-
-    res = MultimodalOutput(
-        text_output=response_text,
-    )
-    if type(response) is Response:
-        token_usage = response["usage"]["total_tokens"]
-        logger.log(token_usage)
-    else:
-        logger.log(0)
-    return res
-
-
-def main() -> None:
-    q = "How many people worldwide have their basic needs met when it comes to food, "
-    "water, toilets, electricity, schooling and healthcare?"
-    print(
-        model_compare(
-            "1",
-            q,
-            "en_US",
-            "Around 20%",
-            "3",
-            "Around 50%",
-            "2",
-            "Around 80%",
-            "1",
-            ExperimentState(),
-        )
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/automation-api/yival_experiments/custom_configuration/model_config_variation_generator.py b/automation-api/yival_experiments/custom_configuration/model_config_variation_generator.py
deleted file mode 100644
index f708a6a..0000000
--- a/automation-api/yival_experiments/custom_configuration/model_config_variation_generator.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from dataclasses import asdict, dataclass, field
-from typing import Any, Iterator, List, Optional
-
-from model_config_variation_generator_config import ModelConfigVariationGeneratorConfig
-
-# from yival.schemas.experiment_config import WrapperVariation
-# ^ this is not working for dict so I write my own version
-from yival.variation_generators.base_variation_generator import BaseVariationGenerator
-
-
-@dataclass
-class WrapperVariation:
-    """
-    Represents a variation within a wrapper.
-    The value can be any type, but typical usages might include strings,
-    numbers, configuration dictionaries, or even custom class configurations.
-    """
-
-    value_type: str  # e.g., "string", "int", "float", "ClassA", ...
-    value: Any  # The actual value or parameters to initialize a value
-    instantiated_value: Any = field(init=False)
-    variation_id: Optional[str] = None
-
-    def asdict(self):
-        return asdict(self)
-
-    def __post_init__(self):
-        self.instantiated_value = self.instantiate()
-
-    def instantiate(self) -> Any:
-        """
-        Returns an instantiated value based on value_type and params.
-        """
-        return self.value
-
-
-class ModelConfigVariationGenerator(BaseVariationGenerator):
-    def __init__(self, config: ModelConfigVariationGeneratorConfig):
-        super().__init__(config)
-        self.config = config
-
-    def generate_variations(self) -> Iterator[List[WrapperVariation]]:
-        if not self.config.models:
-            yield []
-        else:
-            variations = [
-                WrapperVariation(value_type="dict", value=var)
-                for var in self.config.models
-            ]
-            yield variations
diff --git a/automation-api/yival_experiments/custom_configuration/model_config_variation_generator_config.py b/automation-api/yival_experiments/custom_configuration/model_config_variation_generator_config.py
deleted file mode 100644
index 7369f2b..0000000
--- a/automation-api/yival_experiments/custom_configuration/model_config_variation_generator_config.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
-
-from yival.schemas.varation_generator_configs import BaseVariationGeneratorConfig
-
-
-@dataclass
-class ModelConfigVariationGeneratorConfig(BaseVariationGeneratorConfig):
-    models: Optional[List[Dict[str, Any]]] = None  # List of variations to generate
diff --git a/automation-api/yival_experiments/custom_configuration/model_config_wrapper.py b/automation-api/yival_experiments/custom_configuration/model_config_wrapper.py
deleted file mode 100644
index 239ce79..0000000
--- a/automation-api/yival_experiments/custom_configuration/model_config_wrapper.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from typing import Any, Dict, Optional
-
-from model_config_wrapper_config import ModelConfigWrapperConfig
-from yival.experiment.experiment_runner import ExperimentState
-from yival.wrappers.base_wrapper import BaseWrapper
-
-
-class ModelConfigWrapper(BaseWrapper):
-    """
-    A wrapper for model configuration.
-
-    Configuration is a dictionary contains 2 keys:
-
-    - model_name: the name of model, which is a string
-    - params: the configuration of model, which is a dictionary
-    """
-
-    default_config = ModelConfigWrapperConfig()
-
-    def __init__(
-        self,
-        value: Dict[str, Any],
-        name: str,
-        config: Optional[ModelConfigWrapperConfig] = None,
-        state: Optional[ExperimentState] = None,
-    ) -> None:
-        super().__init__(name, config, state)
-        self._value = value
-
-    def get_value(self) -> Dict[str, Any]:
-        variation = self.get_variation()
-        if variation is not None:
-            return variation
-        return self._value
diff --git a/automation-api/yival_experiments/custom_configuration/model_config_wrapper_config.py b/automation-api/yival_experiments/custom_configuration/model_config_wrapper_config.py
deleted file mode 100644
index 8bf8725..0000000
--- a/automation-api/yival_experiments/custom_configuration/model_config_wrapper_config.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from dataclasses import dataclass
-
-from yival.schemas.wrapper_configs import BaseWrapperConfig
-
-
-@dataclass
-class ModelConfigWrapperConfig(BaseWrapperConfig):
-    """
-    Configuration specific to the ModelConfigWrapper.
-    """
-
-    pass
diff --git a/automation-api/yival_experiments/custom_configuration/question_reader.py b/automation-api/yival_experiments/custom_configuration/question_reader.py
deleted file mode 100644
index 0561c2a..0000000
--- a/automation-api/yival_experiments/custom_configuration/question_reader.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from typing import Iterator, List
-
-from question_reader_config import QuestionReaderConfig
-from yival.data.base_reader import BaseReader
-from yival.schemas.common_structures import InputData
-
-from lib.pilot.helpers import get_questions, read_ai_eval_spreadsheet
-
-
-class QuestionReader(BaseReader):
-    """
-    QuestionReader is ...
-
-    Attributes:
-        config (TXTReaderConfig): Configuration object specifying reader parameters.
-
-    Methods:
-        __init__(self, config: TXTReaderConfig): Initializes the TXTReader with
-        a given configuration.
-        read(self, path: str) -> Iterator[List[InputData]]: Reads the TXT file
-        and yields chunks of InputData.
-    """
-
-    config: QuestionReaderConfig
-    default_config = QuestionReaderConfig()
-
-    def __init__(self, config: QuestionReaderConfig):
-        super().__init__(config)
-        self.config = config
-
-    def read(self, path: str) -> Iterator[List[InputData]]:
-        sheet = read_ai_eval_spreadsheet()
-        questions = get_questions(sheet)
-
-        for q, opts in questions:
-            options_text = [f"{opt.letter}. {opt.question_option}" for opt in opts]
-            content = {
-                "question_id": q.question_id,
-                "question_text": q.published_version_of_question,
-                "options_text": options_text,
-            }
-            correct_answer = list(
-                filter(lambda x: x.correctness_of_answer_option == 1, opts)
-            )[0]
-            expected_result = (
-                f"{correct_answer.letter}. {correct_answer.question_option}"
-            )
-
-            example_id = self.generate_example_id({"content": content}, "")
-            input_data_instance = InputData(
-                example_id=example_id, content=content, expected_result=expected_result
-            )
-            yield [input_data_instance]
diff --git a/automation-api/yival_experiments/custom_configuration/question_reader_config.py b/automation-api/yival_experiments/custom_configuration/question_reader_config.py
deleted file mode 100644
index a27c9a7..0000000
--- a/automation-api/yival_experiments/custom_configuration/question_reader_config.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from dataclasses import asdict, dataclass
-
-from yival.data.base_reader import BaseReaderConfig
-
-
-@dataclass
-class QuestionReaderConfig(BaseReaderConfig):
-    """
-    Configuration specific to the questions reader.
-    """
-
-    def asdict(self):
-        return asdict(self)
diff --git a/automation-api/yival_experiments/custom_configuration/simple_evaluator.py b/automation-api/yival_experiments/custom_configuration/simple_evaluator.py
deleted file mode 100644
index 9312b65..0000000
--- a/automation-api/yival_experiments/custom_configuration/simple_evaluator.py
+++ /dev/null
@@ -1,171 +0,0 @@
-"""
-This module defines the SimpleEvaluator class, which is used for
-evaluating string expected results.
-
-Classes:
-    SimpleEvaluator: Class for evaluating string expected
-    results.
-
-"""
-
-import logging
-
-from fuzzywuzzy import fuzz
-from pydantic import BaseModel
-from simple_evaluator_config import SimpleEvaluatorConfig
-from yival.evaluators.base_evaluator import BaseEvaluator
-from yival.schemas.evaluator_config import (
-    EvaluatorOutput,
-    ExpectedResultEvaluatorConfig,
-)
-from yival.schemas.experiment_config import ExperimentResult
-
-logger = logging.getLogger("test")
-
-
-# FIXME: move this class to the lib module.
-class ExperimentInput(BaseModel):
-    question_id: str
-    question_text: str
-    language: str
-    option_a: str
-    option_a_correctness: str
-    option_b: str
-    option_b_correctness: str
-    option_c: str
-    option_c_correctness: str
-
-    class Config:
-        population_by_name = True
-
-
-def get_answers_dict(x: ExperimentInput) -> dict:
-    mapping = {"Correct": 1, "Wrong": 2, "Very Wrong": 3}
-    tpl = list(
-        zip(
-            [x.option_a, x.option_b, x.option_c],
-            [x.option_a_correctness, x.option_b_correctness, x.option_c_correctness],
-        )
-    )
-    tpl.sort(key=lambda x: mapping[x[1]])
-    return {
-        "correct_answer": tpl[0][0],
-        "wrong_answer": tpl[1][0],
-        "very_wrong_answer": tpl[2][0],
-    }
-
-
-def evaluate_text(input_string, correct_answer, wrong_answer, very_wrong_answer):
-    """Grade the result by fuzzy matching the answers."""
-    # Set a threshold for fuzzy matching
-    threshold = 95
-
-    # Function to check if a string contains an answer
-    def contains_answer(text, answer):
-        return fuzz.partial_ratio(text.lower(), answer.lower()) >= threshold
-
-    # Check for each answer type
-    has_correct = contains_answer(input_string, correct_answer)
-    has_wrong = contains_answer(input_string, wrong_answer)
-    has_very_wrong = contains_answer(input_string, very_wrong_answer)
-
-    # Count how many answer types are present
-    answer_count = sum([has_correct, has_wrong, has_very_wrong])
-
-    # Evaluate based on the conditions
-    if answer_count == 1:
-        if has_correct:
-            return 3
-        elif has_wrong:
-            return 2
-        elif has_very_wrong:
-            return 1
-
-    # Return 0 if no answers or multiple answers are present
-    return 0
-
-
-class SimpleEvaluator(BaseEvaluator):
-    """
-    Class for evaluating string expected results.
-
-    This class extends the BaseEvaluator and provides specific implementation
-    for evaluating string expected results using different matching techniques.
-
-    Attributes:
-        config (ExpectedResultEvaluatorConfig): Configuration object for the
-                                                evaluator.
-
-    """
-
-    default_config = SimpleEvaluatorConfig(name="simple_evaluator")
-
-    def __init__(self, config: SimpleEvaluatorConfig):
-        """
-        Initialize the SimpleEvaluator with the provided
-        configuration.
-
-        Args:
-            config (ExpectedResultEvaluatorConfig): Configuration object for
-            the evaluator.
-
-        """
-        super().__init__(config)
-        self.config: SimpleEvaluatorConfig = config
-
-    def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
-        """
-        Evaluate the expected result against the actual result using the
-        specified matching technique.
-
-        Returns:
-            EvaluatorOutput: An EvaluatorOutput object containing the
-            evaluation result.
-
-        """
-        input_data = ExperimentInput(**experiment_result.input_data.content)
-        raw_output = experiment_result.raw_output.text_output
-        answer_dict = get_answers_dict(input_data)
-        result = evaluate_text(raw_output, **answer_dict)
-        return EvaluatorOutput(
-            name=self.config.name,
-            display_name="matching",
-            result=result,
-            metric_calculators=self.config.metric_calculators,
-        )
-
-
-BaseEvaluator.register_evaluator(
-    "simple_evaluator", SimpleEvaluator, ExpectedResultEvaluatorConfig
-)
-
-
-def main():
-
-    from example_evaluator_data import (
-        content,
-        raw_output,
-    )
-    from yival.schemas.experiment_config import (
-        ExperimentResult,
-        InputData,
-        MultimodalOutput,
-    )
-
-    input_data_example = InputData(content=content)
-    experiment_result_example = ExperimentResult(
-        input_data=input_data_example,
-        combination={"wrapper1": "var1"},
-        raw_output=MultimodalOutput(text_output=raw_output),
-        latency=150.0,
-        token_usage=40,
-    )
-
-    evaluator_config = SimpleEvaluatorConfig(name="simple_evaluator")
-    evaluator = SimpleEvaluator(evaluator_config)
-    result = evaluator.evaluate(experiment_result_example)
-    print("Result: ", result.result)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/automation-api/yival_experiments/custom_configuration/simple_evaluator_config.py b/automation-api/yival_experiments/custom_configuration/simple_evaluator_config.py
deleted file mode 100644
index fb6050c..0000000
--- a/automation-api/yival_experiments/custom_configuration/simple_evaluator_config.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from dataclasses import asdict, dataclass
-from typing import Any, Dict
-
-from yival.schemas.evaluator_config import EvaluatorConfig, EvaluatorType
-
-
-@dataclass
-class SimpleEvaluatorConfig(EvaluatorConfig):
-    evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
-    description: str = "This is the description of the evaluator."
-    scale_description: str = "0-4"
-
-    def asdict(self) -> Dict[str, Any]:
-        return asdict(self)
diff --git a/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator.py b/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator.py
deleted file mode 100644
index 29bf3e9..0000000
--- a/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""
-An evaluator that uses Vertex AI's prompt-based system for evaluations.
-
-The evaluator interfaces with the Vertex AI API to present tasks and interpret
-the model's responses to determine the quality or correctness of a given
-experiment result.
-"""
-
-import copy
-import logging
-import os
-
-import litellm
-from evaluator_common import (
-    CLASSIFY_STR,
-    calculate_choice_score,
-    choices_to_string,
-    completion_with_backpff,
-    extract_choice_from_response,
-    format_template,
-)
-from vertex_ai_evaluator_config import VertexAIEvaluatorConfig
-from yival.evaluators.base_evaluator import BaseEvaluator
-from yival.schemas.evaluator_config import (
-    EvaluatorOutput,
-    EvaluatorType,
-    MethodCalculationMethod,
-    MetricCalculatorConfig,
-)
-from yival.schemas.experiment_config import (
-    ExperimentResult,
-    InputData,
-    MultimodalOutput,
-)
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class VertexAIEvaluator(BaseEvaluator):
-    """Evaluator using VertexAI's prompt-based evaluation."""
-
-    default_config = VertexAIEvaluatorConfig(name="vertex_ai_evaluator")  # type: ignore
-
-    def __init__(self, config: VertexAIEvaluatorConfig):
-        super().__init__(config)
-        self.config = config
-        if "claude" in self.config.model_name:
-            self.vertex_location = "us-east5"
-        else:
-            self.vertex_location = "us-central1"
-
-    def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
-        """Evaluate the experiment result using Vertex AI's prompt-based evaluation."""
-        assert isinstance(self.config, VertexAIEvaluatorConfig)
-        format_dict = copy.deepcopy(experiment_result.input_data.content)
-        format_dict["raw_output"] = experiment_result.raw_output.text_output
-
-        prompt = format_template(self.config.prompt, format_dict)
-        if isinstance(prompt, str):
-            prompt = [{"role": "user", "content": prompt}]
-
-        prompt[-1]["content"] += "\n\n" + CLASSIFY_STR.format(
-            choices=choices_to_string(self.config.choices)
-        )
-        response = completion_with_backpff(
-            model=self.config.model_name,
-            messages=prompt,
-            temperature=0.0,
-            n=1,
-            max_tokens=2000,
-            request_timeout=60,
-            caching=True,
-            vertex_ai_location=self.vertex_location,
-            vertex_ai_project=os.environ["VERTEXAI_PROJECT"],
-        )
-        # response = openai.ChatCompletion.create(
-        #     model="gpt-4", messages=prompt, temperature=0.5)
-        response_content = response["choices"][0]["message"]["content"]
-        choice = extract_choice_from_response(response_content, self.config.choices)
-        score = calculate_choice_score(choice, self.config.choice_scores)
-        return EvaluatorOutput(
-            name=self.config.name,
-            result=score if score is not None else choice,
-            display_name=self.config.display_name,
-            metric_calculators=self.config.metric_calculators,
-        )
-
-
-BaseEvaluator.register_evaluator(
-    "vertex_ai_evaluator", VertexAIEvaluator, VertexAIEvaluatorConfig
-)
-
-
-def main():
-    """Main function to test the OpenAIPromptBasedEvaluator."""
-    from example_evaluator_data import (
-        choice_scores,
-        choices,
-        content,
-        prompt,
-        raw_output,
-    )
-
-    from lib.config import read_config
-
-    read_config()
-    litellm.set_verbose = True
-
-    evaluator_config = VertexAIEvaluatorConfig(
-        name="gpt4_evaluator",
-        display_name="correctness test",
-        metric_calculators=[
-            MetricCalculatorConfig(
-                MethodCalculationMethod(MethodCalculationMethod.AVERAGE)
-            )
-        ],
-        model_name="vertex_ai/gemini-pro-experimental",
-        prompt=prompt,
-        choices=choices,
-        evaluator_type=EvaluatorType.INDIVIDUAL,
-        choice_scores=choice_scores,
-    )
-
-    input_data_example = InputData(content=content)
-
-    experiment_result_example = ExperimentResult(
-        input_data=input_data_example,
-        combination={"wrapper1": "var1", "wrapper2": "var2"},
-        raw_output=MultimodalOutput(text_output=raw_output),
-        latency=150.0,
-        token_usage=50,
-    )
-
-    evaluator = VertexAIEvaluator(evaluator_config)
-    result = evaluator.evaluate(experiment_result_example)
-    print("Result: ", result.result)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator_config.py b/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator_config.py
deleted file mode 100644
index 8e81ab5..0000000
--- a/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator_config.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from dataclasses import asdict, dataclass, field
-from typing import Any, Dict, List, Optional, Union
-
-from yival.schemas.evaluator_config import EvaluatorConfig, EvaluatorType
-
-
-@dataclass
-class VertexAIEvaluatorConfig(EvaluatorConfig):
-    evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
-    prompt: Union[str, List[Dict[str, str]]] = ""
-    choices: List[str] = field(default_factory=list)
-    model_name: str = "vertex_ai/claude-3-opus@20240229"
-    description: str = "This is the description of the evaluator."
-    scale_description: str = "0-4"
-    choice_scores: Optional[Dict[str, float]] = None
-
-    def asdict(self) -> Dict[str, Any]:
-        return asdict(self)
diff --git a/automation-api/yival_experiments/data/questions_en-US.csv b/automation-api/yival_experiments/data/questions_en-US.csv
deleted file mode 100644
index 83892e3..0000000
--- a/automation-api/yival_experiments/data/questions_en-US.csv
+++ /dev/null
@@ -1,366 +0,0 @@
-question_id,question_text,language,option_a,option_a_correctness,option_b,option_b_correctness,option_c,option_c_correctness,correct_answer
-80,"Since 1990, roughly 2.5 million people were made temporarily homeless in Europe due to natural disasters. The number for Asia during the same period was...",en-US,Roughly the same,Very Wrong,4 times more,Wrong,40 times more,Correct,40 times more
-106,How many people in the world feel safe walking alone at night where they live?,en-US,Less than 30%,Very Wrong,Around 45%,Wrong,More than 60%,Correct,More than 60%
-14,What share of all plastic waste in the world ends up in the oceans?,en-US,Less than 1%,Correct,Around 36%,Wrong,More than 66%,Very Wrong,Less than 1%
-1632,"Globally, what share of deaths is caused by overweight and obesity?",en-US,Less than 10%,Correct,Around 25%,Wrong,Around 40%,Very Wrong,Less than 10%
-1543,"In 2023, how many companies in Africa had revenues of more than 1 billion US Dollars?",en-US,Around 80,Very Wrong,Around 210,Wrong,Around 340,Correct,Around 340
-1,What happened to the global suicide rate in the last 20 years?,en-US,Decreased about 25%,Correct,Stayed about the same,Wrong,Increased about 25%,Very Wrong,Decreased about 25%
-1757,"In 2023, the number of children who died before age five was around 4% worldwide. What was this number back in 1900?",en-US,Around 14%,Very Wrong,Around 20%,Wrong,Around 40%,Correct,Around 40%
-1745,"During the 1990s, the total damage caused by natural disasters was about 0.25% of total world income (GDP). During the last 10 years, that number was…",en-US,Roughly the same,Correct,Two times higher,Wrong,Four times higher,Very Wrong,Roughly the same
-50,How many people in the world have some access to electricity?,en-US,Around 30%,Very Wrong,Around 60%,Wrong,Around 90%,Correct,Around 90%
-25,"For every 100kg of food produced in the world, how much is transported to a different country?",en-US,17kg,Correct,37kg,Wrong,57kg,Very Wrong,17kg
-42,"Up to 1990, 22 countries in the world had been led by a female head of state or government. What is that number today?",en-US,39,Very Wrong,59,Wrong,89,Correct,89
-1391,How many countries have made cooperation agreements with China's Belt and Road Initiative?,en-US,Around 40,Very Wrong,Around 80,Wrong,Around 140,Correct,Around 140
-1517,Child deaths in Africa today are at the same levels as in Europe in:,en-US,1850,Very Wrong,1900,Wrong,1950,Correct,1950
-59,During the past 40 years the total amount of oil and natural gas in known underground reserves:,en-US,Reduced to less than half,Very Wrong,Remained about the same,Wrong,More than doubled,Correct,More than doubled
-1589,"In 1990, 39% of the global labour force was female. What is this value today?",en-US,Around 39%,Correct,Around 42%,Wrong,Around 46%,Very Wrong,Around 39%
-11,"In 1990, 58% of the world's population lived in low-income countries. What is the share today?",en-US,Around 9%,Correct,Around 37%,Wrong,Around 61%,Very Wrong,Around 9%
-1755,"Of all girls aged 6 to 11 in the world, how many go to school?",en-US,Less than 30%,Very Wrong,Around 60%,Wrong,Around 90%,Correct,Around 90%
-1528,"How many people worldwide have their basic needs met when it comes to food, water, toilets, electricity, schooling and healthcare?",en-US,Around 25%,Very Wrong,Around 55%,Wrong,Around 85%,Correct,Around 85%
-58,What happened to the total amount of raw materials used across the world annually since 2000?,en-US,Stayed about the same,Very Wrong,Increased about 35%,Wrong,Increased about 70%,Correct,Increased about 70%
-32,In which countries are people on average least satisfied with their lives?,en-US,Low-income countries,Correct,Middle-income countries,Wrong,High-income countries,Very Wrong,Low-income countries
-15,What share of the world's population are international refugees?,en-US,Around 0.6%,Correct,Around 6%,Wrong,Around 16%,Very Wrong,Around 0.6%
-1498,"For all vertebrate species (animals with skeletons), how much did their wild populations decline on average during the past 50 years?",en-US,Around 10% decline,Very Wrong,Around 40% decline,Wrong,Around 70% decline,Correct,Around 70% decline
-10,How many companies in the world have a woman as top manager or CEO?,en-US,Around 2%,Very Wrong,Around 10%,Wrong,Around 18%,Correct,Around 18%
-72,"In 1990, 3% of the world’s population lived in a different country than where they were born. What is that share today?",en-US,4%,Correct,14%,Wrong,24%,Very Wrong,4%
-97,What group of animals has the highest share of threatened species?,en-US,Birds,Wrong,Mammals,Very Wrong,Amphibians,Correct,Amphibians
-1521,"When biologists started counting fish in the oceans in 1950, around 1% of existing fish stocks were overexploited. By 2019 this share was...",en-US,...around 5%,Very Wrong,...around 20%,Wrong,...more than 35%,Correct,...more than 35%
-1717,"Countries with incomes like India, Morocco and Bolivia, emit how much CO2 per person, compared to high-income countries?",en-US,80% less,Correct,40% less,Wrong,20% more,Very Wrong,80% less
-1764,What happened to CO2 emissions from the EU and USA since 2005?,en-US,They decreased 20%,Correct,They stayed the same,Wrong,They increased 20%,Very Wrong,They decreased 20%
-79,Megacities are cities with more than 10 million inhabitants. What share of the world's population is expected to live in megacities in 2030?,en-US,Around 9%,Correct,Around 39%,Wrong,Around 69%,Very Wrong,Around 9%
-2,How did the number of deaths per year from natural disasters change over the last hundred years?,en-US,More than doubled,Very Wrong,Remained about the same,Wrong,Decreased to less than half,Correct,Decreased to less than half
-4,What share of the population in high-income countries (like Germany and the USA) live in extreme poverty (with less than $2/day)?,en-US,Less than 1%,Correct,Around 11%,Wrong,Around 21%,Very Wrong,Less than 1%
-111,"Of all financial aid received by the least developed countries, how much comes from charity and philanthropy?",en-US,Around 5%,Correct,Around 25%,Wrong,Around 50%,Very Wrong,Around 5%
-1794,"Of all greenhouse gas emissions from the global food system, how much comes from transporting food?",en-US,Around 6%,Correct,Around 36%,Wrong,Around 66%,Very Wrong,Around 6%
-21,"In 1980, roughly 40% of the world's population lived in extreme poverty, with less than $2 per day. What is the share today?",en-US,Around 10%,Correct,Around 30%,Wrong,Around 50%,Very Wrong,Around 10%
-37,"Worldwide, how many children under age 15 do not achieve the required minimum skills in reading and math?",en-US,Around 10%,Very Wrong,Around 30%,Wrong,Around 50%,Correct,Around 50%
-1510,"Of all children aged 6 to 11 in the world, how many go to school?",en-US,Less than 25%,Very Wrong,Around 60%,Wrong,More than 85%,Correct,More than 85%
-1758,"Of all money earned in the world, how much comes from services, such as administration, banking, care, teaching, transport and entertainment?",en-US,Around 30%,Very Wrong,Around 45%,Wrong,More than 60%,Correct,More than 60%
-1499,"In the 1950s, 50% of all wars occurred between countries that are recognized as sovereign states by the UN. What is that number today?",en-US,Around 5%,Correct,Around 25%,Wrong,Around 55%,Very Wrong,Around 5%
-53,"Globally, around 160 million children are used for child labor. In what sector do the majority of them work?",en-US,Industry,Very Wrong,Agriculture,Correct,Services,Wrong,Agriculture
-17,What share of countries in the world have laws against sexual harassment at work?,en-US,Around 30%,Very Wrong,Around 50%,Wrong,Around 70%,Correct,Around 70%
-18,"How much of the world's economy comes from agriculture, forestry and fishing?",en-US,Around 5%,Correct,Around 25%,Wrong,Around 45%,Very Wrong,Around 5%
-9,"Of all energy used in the world, how much comes from natural gas, coal and oil?",en-US,Around 40%,Very Wrong,Around 60%,Wrong,Around 80%,Correct,Around 80%
-6,How many people in the world have access to basic drinking water within 30 minutes of their home?,en-US,Around 50%,Very Wrong,Around 70%,Wrong,Around 90%,Correct,Around 90%
-3,"In low-income countries across the world in 2022, what share of girls went to school until at least age 11?",en-US,Around 20%,Very Wrong,Around 40%,Wrong,Around 60%,Correct,Around 60%
-13,What share of the world’s population lives in megacities (cities with at least 10 million people)?,en-US,Around 8%,Correct,Around 28%,Wrong,Around 48%,Very Wrong,Around 8%
-8,"Biologists have evaluated the status of more than 150,000 species of plants and animals. How many are endangered or threatened?",en-US,Around 30%,Correct,Around 60%,Wrong,Around 90%,Very Wrong,Around 30%
-19,The governments of high-income countries get how much of their revenue from customs and import duties?,en-US,Around 2%,Correct,Around 12%,Wrong,Around 22%,Very Wrong,Around 2%
-5,How much of the excess heat from global warming is captured in the oceans?,en-US,Around 9%,Very Wrong,Around 49%,Wrong,Around 89%,Correct,Around 89%
-12,What share of the world’s population don't have enough food to meet their daily energy needs?,en-US,Around 11%,Correct,Around 23%,Wrong,Around 37%,Very Wrong,Around 11%
-1793,"How much of all the money earned in sub-Saharan Africa comes from agriculture, forestry and fishing?",en-US,Around 20%,Correct,Around 40%,Wrong,Around 60%,Very Wrong,Around 20%
-1792,"If POOR means people with less than $2/day, and RICH means more than $200/day. Today the number of POOR per RICH is…",en-US,…smaller than ever,Correct,…the same as always,Wrong,…larger than ever,Very Wrong,…smaller than ever
-1791,"ReliefWeb is the UN's service to coordinate disaster relief work worldwide, day and night. How many humanitarian situation reports did it publish in 2023?",en-US,Around 100,Very Wrong,"Around 1,000",Wrong,"Around 10,000",Correct,"Around 10,000"
-1790,How much of all greenhouse gas emissions come from transport?,en-US,Around 16%,Correct,Around 36%,Wrong,Around 56%,Very Wrong,Around 16%
-1786,"Since 1961, the land used for crop agriculture worldwide has increased by 13%. What has happened to the annual amount of grains (corn, wheat, rice etc) produced?",en-US,It decreased 10%,Very Wrong,Stayed about the same,Wrong,It increased 240%,Correct,It increased 240%
-1789,"Since 1961, the land used for crop agriculture worldwide has increased by 13%. What has happened to the annual amount of grains (corn, wheat, rice etc) produced?",en-US,Decreased 10%,Very Wrong,Stayed about the same,Wrong,Increased 240%,Correct,Increased 240%
-1788,What share of Africa's population aren't able to access enough nutritious food every day?,en-US,Around 60%,Correct,Around 75%,Wrong,Around 95%,Very Wrong,Around 60%
-1782,"Of all deaths in Africa, what share is caused by a lack of food?",en-US,Around 2%,Correct,Around 32%,Wrong,Around 62%,Very Wrong,Around 2%
-1783,"Of the 195 countries in the world, how many have some kind of school feeding program?",en-US,Fewer than 20,Very Wrong,Around 70,Wrong,More than 150,Correct,More than 150
-1784,"In the United States and Europe, how many children are covered by some kind of social protection program in case their parents can't give them food?",en-US,Less than 15%,Very Wrong,Around 50%,Wrong,More than 85%,Correct,More than 85%
-1766,"What happened to the average amount of food produced from a field with potatoes, cassava, maize, rice and wheat since 1960?",en-US,Dropped to less than half,Very Wrong,Stayed more or less the same,Wrong,It nearly doubled,Correct,It nearly doubled
-1604,How many children in Europe (including Russia and Türkiye) are obese today?,en-US,Around 10%,Correct,Around 25%,Wrong,Around 40%,Very Wrong,Around 10%
-1513,"Between 2005 and 2020, $57 billion was spent on food aid for poor countries. How much was spent researching new crops that could yield more food or survive extreme weather?",en-US,$9 billion,Correct,$39 billion,Wrong,$69 billion,Very Wrong,$9 billion
-1511,How much of the food eaten by people in Africa is produced in Africa?,en-US,Less than 20%,Very Wrong,Around 50%,Wrong,More than 80%,Correct,More than 80%
-27,How many countries worldwide have holdings of plant genetic materials conserved in genebanks?,en-US,Less than 10,Very Wrong,Around 50,Wrong,Around 100,Correct,Around 100
-29,"In 1995, all countries together spent 4,600 million US dollars on agriculture export subsidies. How much was spent in 2019?",en-US,100 million USD,Correct,"1,000 million USD",Wrong,"10,000 million USD",Very Wrong,100 million USD
-26,"Worldwide, how many children under age 5 are overweight?",en-US,6%,Correct,26%,Wrong,46%,Very Wrong,6%
-28,Which of the following regions has the largest share of children under 5 who are dangerously underweight?,en-US,North Africa & Middle East,Very Wrong,South Asia,Correct,Sub-Saharan Africa,Wrong,South Asia
-1787,"How much of the world's economy comes from agriculture, forestry and fishing?",en-US,Around 4%,Correct,Around 24%,Wrong,Around 44%,Very Wrong,Around 4%
-1785,Which of the following regions has the highest share of female researchers?,en-US,Europe,Wrong,Central Asia,Correct,North America,Very Wrong,Central Asia
-1781,What share of people in high-income countries can't afford enough food for their daily energy needs?,en-US,Less than 2%,Correct,Around 12%,Wrong,Around 22%,Very Wrong,Less than 2%
-1780,What share of the world's population aren't able to access enough nutritious food every day?,en-US,Around 30%,Correct,Around 50%,Wrong,Around 70%,Very Wrong,Around 30%
-1550,What percentage of the world's population lives in countries where women on average have 5 or more babies?,en-US,Around 5%,Correct,Around 35%,Wrong,Around 65%,Very Wrong,Around 5%
-1571,"Worldwide, what percentage of women in stable relationships who want to use contraceptives, don't have access to them?",en-US,10%,Correct,30%,Wrong,50%,Very Wrong,10%
-1763,In how many of the world’s 195 countries do women NOT have the right to vote?,en-US,1,Correct,23,Wrong,46,Very Wrong,1
-1748,How many girls are married by the age of 15 in Sub-Saharan Africa?,en-US,Around 10%,Correct,Around 30%,Wrong,Around 50%,Very Wrong,Around 10%
-1719,What share of all countries have some kind of law to protect women against domestic violence?,en-US,Around 25%,Very Wrong,Around 50%,Wrong,Around 75%,Correct,Around 75%
-1553,"Of all men aged 25-29 worldwide, about 90% are in the labor force (have a job or are seeking one). What’s the number for women?",en-US,Around 60%,Correct,Around 70%,Wrong,Around 80%,Very Wrong,Around 60%
-1652,How many countries have laws which say that men and women should be paid the same amount for doing work of equal value?,en-US,Around 10%,Very Wrong,Around 25%,Wrong,Around 40%,Correct,Around 40%
-1650,In how many countries is the highest political leader a woman?,en-US,Around 5,Very Wrong,Around 10,Wrong,Around 15,Correct,Around 15
-1619,How many women are married before the age of 18 in Sub-Saharan Africa?,en-US,Around 30%,Correct,Around 50%,Wrong,Around 70%,Very Wrong,Around 30%
-1574,In how many countries (out of 195) do married men and women NOT have equal legal rights to own land or houses?,en-US,Around 20,Correct,Around 80,Wrong,Around 140,Very Wrong,Around 20
-1500,"Across the world, women do more unpaid care and domestic work than men. How much more?",en-US,50% more,Very Wrong,Around twice as much,Wrong,Around three times more,Correct,Around three times more
-43,"In how many countries, out of 195, is marriage under age 18 legally possible?",en-US,13,Very Wrong,73,Wrong,113,Correct,113
-41,"Worldwide, which of these contraceptive methods is the most used by women?",en-US,Pill,Very Wrong,Sterilization,Correct,Intrauterine Device (IUD),Wrong,Sterilization
-44,"Of 195 countries, how many have signed the UN Convention on the Elimination of All Forms of Discrimination against Women?",en-US,50,Very Wrong,120,Wrong,190,Correct,190
-40,"Worldwide, what share of seats in national parliaments are held by women today?",en-US,Around 5%,Very Wrong,Around 15%,Wrong,Around 25%,Correct,Around 25%
-1611,What share of the world's population lives in middle-income countries today?,en-US,Around 25%,Very Wrong,Around 50%,Wrong,Around 75%,Correct,Around 75%
-1718,Around half of the world's population is below:,en-US,33 years of age,Correct,43 years of age,Wrong,53 years of age,Very Wrong,33 years of age
-76,"Today, more than 5 times more people live in towns and cities than in 1950. What happened to the rural population?",en-US,Almost halved,Very Wrong,Stayed about the same,Wrong,Almost doubled,Correct,Almost doubled
-1567,How many of the world’s 195 countries do UN experts expect to decrease in population size between now and 2050?,en-US,Around 10 countries,Very Wrong,Around 30 countries,Wrong,Around 50 countries,Correct,Around 50 countries
-1654,It took 12 years for the world's population to grow from 6 to 7 billion people. How long did it take from 7 to 8 billion?,en-US,2 years,Very Wrong,7 years,Wrong,12 years,Correct,12 years
-1631,"Thirty years ago, less than 25% of the world's population lived in middle-income countries. What is the share today?",en-US,Around 15%,Very Wrong,Around 45%,Wrong,Around 75%,Correct,Around 75%
-1627,"What share of all countries will have fewer people in 2050 than they have today, if current population trends continue?",en-US,5%,Very Wrong,15%,Wrong,25%,Correct,25%
-1622,There are currently 8 billion people on Earth. How many people do UN experts think there will be in 2100?,en-US,Around 10.5 billion,Correct,Around 16.5 billion,Wrong,Around 22.5 billion,Very Wrong,Around 10.5 billion
-56,"Today, 65% of the world’s population are of working age (15-64 years old). What do UN experts think this number will be in 2100?",en-US,50%,Wrong,60%,Correct,70%,Very Wrong,60%
-1603,"By 2050, the number of old people (age 65+) will double everywhere, except:",en-US,Low-income countries,Very Wrong,Middle-income countries,Wrong,High-income countries,Correct,High-income countries
-1593,The total number of old people worldwide (age 65+) is expected to increase by 800 million by 2050. How much of the increase will happen in high-income countries?,en-US,15%,Correct,35%,Wrong,55%,Very Wrong,15%
-1506,"There are 2 billion children in the world today, aged 0-14 years old. How many children will there be in the year 2100 according to the United Nations?",en-US,4 billion,Very Wrong,3 billion,Wrong,2 billion,Correct,2 billion
-1504,What share of the world’s population lives in countries where women on average have less than 3 babies?,en-US,40%,Very Wrong,60%,Wrong,80%,Correct,80%
-1505,"According to the United Nations, the world population will increase by another 2.4 billion people by 2100. The main reason is that there will be more…",en-US,children (below age 15),Very Wrong,adults (age 15 to 74),Correct,very old (above age 75),Wrong,adults (age 15 to 74)
-1779,"Of all people living in extreme poverty worldwide (on less than $2 a day), how many are refugees or people living in urban slums?",en-US,Around 20%,Correct,Around 45%,Wrong,Around 70%,Very Wrong,Around 20%
-1778,"What share of the world’s population live in countries where the majority are extremely poor, living with less than $2 a day?",en-US,Around 5%,Correct,Around 25%,Wrong,Around 45%,Very Wrong,Around 5%
-1777,"Of all the people in Middle-Income Countries, how many live in extreme poverty (with less than $2 a day)?",en-US,Around 7%,Correct,Around 22%,Wrong,Around 37%,Very Wrong,Around 7%
-1663,"Among university-educated refugees in the EU in 2019 who were employed, how many of them were overqualified for their jobs?",en-US,Around 20%,Very Wrong,Around 40%,Wrong,Around 60%,Correct,Around 60%
-1642,"When UNICEF bought child vaccines in 2018, what price did they pay on average, compared to the price paid by high-income countries like Germany and USA?",en-US,More than twice the price,Very Wrong,Roughly half the price,Wrong,Less than 20% of the price,Correct,Less than 20% of the price
-52,"In 1990, 19% of people aged 65 and older in the world still worked. What is the share expected to be in 2030?",en-US,Around 21%,Correct,Around 41%,Wrong,Around 61%,Very Wrong,Around 21%
-62,What share of all workers worldwide are employed in the manufacturing sector?,en-US,Around 15%,Correct,Around 35%,Wrong,Around 55%,Very Wrong,Around 15%
-1545,Which of the following causes the most deaths in Africa?,en-US,Conflict and war,Very Wrong,Heart disease and stroke,Correct,HIV / AIDS,Wrong,Heart disease and stroke
-1613,"Worldwide, what causes more deaths?",en-US,Cancer,Wrong,Heart disease and strokes,Correct,Lack of food,Very Wrong,Heart disease and strokes
-1607,How many people in the European Union currently suffer from depression?,en-US,Around 5%,Correct,Around 25%,Wrong,Around 45%,Very Wrong,Around 5%
-1672,"In 1990, around 60% of all electricity worldwide was produced using fossil fuels. What is that number today?",en-US,Around 40%,Very Wrong,Around 50%,Wrong,Around 60%,Correct,Around 60%
-1524,"In 2023, how much fossil fuels (oil, coal and natural gas) were used in the world, compared to the year 2000?",en-US,95% (some less),Very Wrong,120% (some more),Wrong,145% (much more),Correct,145% (much more)
-1720,"Worldwide, how many people in rural areas have access to drinking water within 30 minutes of their home?",en-US,Less than 25%,Very Wrong,Around 50%,Wrong,More than 75%,Correct,More than 75%
-109,"In 2000, low- and middle-income countries on average spent almost 6% of their annual income on their government's loans. What was this number in 2022?",en-US,Around 4%,Correct,Around 8%,Wrong,Around 12%,Very Wrong,Around 4%
-60,How many people in the world have a mobile phone subscription?,en-US,Around 78%,Correct,Around 85%,Wrong,Around 92%,Very Wrong,Around 78%
-1676,"In 2022, the UN spent around $6.4 billion to help refugees worldwide. How much did Western European governments spend to help refugees within West Europe?",en-US,Less than $3 billion,Very Wrong,Around $5 billion,Wrong,More than $20 billion,Correct,More than $20 billion
-1731,"In 2022, how much of the increased electricity production in the world came from renewable sources?",en-US,Around 10%,Very Wrong,Around 45%,Wrong,Around 85%,Correct,Around 85%
-1646,"How much of all the money earned in low-income countries comes from agriculture, forestry and fishing?",en-US,Around 25%,Correct,Around 50%,Wrong,Around 75%,Very Wrong,Around 25%
-1523,"Roughly what is the minimum wage in these countries with lots of textile factories, India, China, Pakistan, Vietnam, Cambodia, Myanmar and Indonesia?",en-US,Around $0.2 a day,Very Wrong,Around $1 a day,Wrong,Around $10 a day,Correct,Around $10 a day
-7,How many people in the world live in areas that are 5 meters or less above sea level?,en-US,Around 11%,Correct,Around 31%,Wrong,Around 51%,Very Wrong,Around 11%
-85,"What happens to the average global temperature if we halve the annual net emissions of CO2, today?",en-US,It decreases,Very Wrong,It stays the same,Wrong,It keeps increasing,Correct,It keeps increasing
-1636,"Compared to the year before, how much less carbon dioxide was emitted worldwide during 2020 because of the Coronavirus Pandemic?",en-US,Around 5% less,Correct,Around 20% less,Wrong,Around 40% less,Very Wrong,Around 5% less
-1620,What share of the total global economy comes from tourism?,en-US,Around 10%,Correct,Around 25%,Wrong,Around 40%,Very Wrong,Around 10%
-1563,What share of all greenhouse gas emissions come from airplanes?,en-US,Less than 6%,Correct,Around 16%,Wrong,More than 26%,Very Wrong,Less than 6%
-1706,Electronics and pharmaceutical companies spend around 12% of their revenue on research and development. How much do energy and fuel producing companies spend?,en-US,Around 0.3%,Correct,Around 3%,Wrong,Around 30%,Very Wrong,Around 0.3%
-1776,How many jobs in the US are supported by exports to China?,en-US,"Around 100,000",Very Wrong,"Around 500,000",Wrong,Around 1 million,Correct,Around 1 million
-1775,What was the top US service export to China in 2021?,en-US,Financial Services,Wrong,Education,Correct,Computer software services,Very Wrong,Education
-1737,How many people in high-income countries see climate change as a threat to their country over the next 20 years?,en-US,Around 20%,Very Wrong,Around 50%,Wrong,Around 80%,Correct,Around 80%
-1691,"Of all energy used in the world, what share is electricity?",en-US,Around 20%,Correct,Around 50%,Wrong,Around 80%,Very Wrong,Around 20%
-1584,"In 2016, around 80 countries agreed to work jointly to stop companies avoiding tax by shifting profits to low-tax locations. How many countries are members today?",en-US,Around 50,Very Wrong,Around 90,Wrong,Around 140,Correct,Around 140
-1542,Worldwide about 30% of researchers are women. What is the share of female researchers in Sub-Saharan Africa?,en-US,Around 10%,Very Wrong,Around 20%,Wrong,Around 30%,Correct,Around 30%
-1716,Which of these free-trade areas includes the most countries?,en-US,North American Free Trade Agreement,Very Wrong,African Continental Free Trade Area,Correct,The European Union,Wrong,African Continental Free Trade Area
-1502,"The United Nations’ Sustainable Development Goals have a total of 169 targets. How many of them mention the words ""democracy"" or ""democratic""?",en-US,0 targets,Correct,14 targets,Wrong,34 targets,Very Wrong,0 targets
-64,What share of the world’s population used the Internet in 2023?,en-US,Around 65%,Correct,Around 80%,Wrong,Around 95%,Very Wrong,Around 65%
-105,What share of the members of the United Nations General Assembly are low- and middle-income countries?,en-US,Around 20%,Very Wrong,Around 45%,Wrong,Around 70%,Correct,Around 70%
-1680,How many countries have satellites in orbit around the earth?,en-US,Around 15,Very Wrong,Around 40,Wrong,Around 80,Correct,Around 80
-1661,How many refugees currently live in the world's largest refugee camp?,en-US,"Around 9,000",Very Wrong,"Around 90,000",Wrong,"Around 900,000",Correct,"Around 900,000"
-70,"When counting the number of refugees in the world, the number includes people who have...",en-US,Fled to a different country,Correct,Fled to another part of their country,Very Wrong,Both of the above,Wrong,Fled to a different country
-69,"Of all refugees worldwide, in 2023, what share had been in exile for more than four years?",en-US,Less than 30%,Very Wrong,Around 45%,Wrong,More than 60%,Correct,More than 60%
-66,What share of all migrants in the world are refugees?,en-US,Around 15%,Correct,Around 25%,Wrong,Around 35%,Very Wrong,Around 15%
-65,Which of these countries hosts the largest share of refugees in relation to its population?,en-US,Germany,Wrong,Lebanon,Correct,Sweden,Very Wrong,Lebanon
-1761,"Every year, energy from the sun (light and heat) reaches the earth. Compared to that energy, how much energy leaves the Earth out into space every year?",en-US,30% less than what comes in,Wrong,0.3% less than what comes in,Correct,30% more than what comes in,Very Wrong,0.3% less than what comes in
-1730,"In high-income countries in 2021, the poorer half of the population emitted roughly 5 tonnes of CO2 per person from burning fossil fuels. How much was emitted per person, by the richest 10%?",en-US,Half as much,Very Wrong,Twice as much,Wrong,Eight times more,Correct,Eight times more
-86,How many of the 195 countries recognized by the UN have adopted the Paris Agreement on climate change and the environment?,en-US,92,Very Wrong,142,Wrong,192,Correct,192
-1738,"From 2015 to 2021, the world population increased by 80 million each year. During the same period, how many more people gained some access to electricity each year?",en-US,Around 15 million,Very Wrong,Around 40 million,Wrong,Around 120 million,Correct,Around 120 million
-1741,"Of all greenhouse gas emissions from human activities, how much come from burning fossil fuels?",en-US,Around 25%,Very Wrong,Around 45%,Wrong,Around 65%,Correct,Around 65%
-1715,What is it all greenhouse gases do?,en-US,Pollute air,Very Wrong,Absorb heat,Correct,Reflect light and heat,Wrong,Absorb heat
-1689,"Of all energy used in the world in 2000, around 86% came from burning fossil fuels (oil, coal and gas). What is the number today?",en-US,Around 52%,Very Wrong,Around 64%,Wrong,Around 82%,Correct,Around 82%
-1736,Which group of countries below produces the highest share of electricity from renewable sources?,en-US,"United States, United Kingdom, The Netherlands",Very Wrong,"Brazil, Kenya, Costa Rica",Correct,"Finland, Sweden, Spain",Wrong,"Brazil, Kenya, Costa Rica"
-23,"Of all people aged 65 or older in high-income countries, how many live below the poverty line?",en-US,14%,Correct,29%,Wrong,44%,Very Wrong,14%
-57,What share of people above retirement age worldwide are entitled to a pension?,en-US,Around 25%,Very Wrong,Around 45%,Wrong,Around 75%,Correct,Around 75%
-1769,"Of all children in high-income countries who had cancer back in 1975, around 58% survived more than 5 years. What is that number today?",en-US,Around 60%,Very Wrong,Around 70%,Wrong,More than 80%,Correct,More than 80%
-1678,"In 2022, many Western European governments spent more than $20 per person per day, to help newly arrived refugees.
-How much did the United Nations spend per refugee per day on average, in the rest of the world?",en-US,Around $1,Correct,Around $25,Wrong,Around $45,Very Wrong,Around $1
-71,"In what part of the world is income inequality the smallest, when measuring how much the richest 10% earn of all income?",en-US,Middle East,Very Wrong,Europe,Correct,Sub-Saharan Africa,Wrong,Europe
-1771,What happened to the global maternal mortality rate between 2000 and 2020 (the rate of mothers dying while giving birth or shortly after)?,en-US,It declined about 30%,Correct,It declined about 10%,Wrong,It stayed about the same,Very Wrong,It declined about 30%
-1770,"Back in 1986, countries with nuclear weapons had around 64,000 warheads stockpiled. What is that number today?",en-US,80% less,Correct,About the same,Wrong,50% more,Very Wrong,80% less
-96,What happened to the number of new tuberculosis cases worldwide between 2000 and 2022?,en-US,Increased 20%,Very Wrong,Stayed about the same,Wrong,Decreased 20%,Correct,Decreased 20%
-1767,"Compared to 30 years ago, the pollution of microplastics in the world's oceans today is roughly:",en-US,8 times higher,Correct,80 times higher,Wrong,800 times higher,Very Wrong,8 times higher
-1722,"Since agreements were made in 1987 to limit ozone-depleting substances, how many have been phased out?",en-US,Less than 40%,Very Wrong,Around 70%,Wrong,More than 95%,Correct,More than 95%
-1640,"Globally, an income of less than $2 a day is considered extreme poverty. In the Nordics (Sweden, Norway, Denmark, Finland, Iceland) national poverty lines are roughly:",en-US,$10 a day,Very Wrong,$20 a day,Wrong,$30 a day,Correct,$30 a day
-31,"Worldwide, how many babies are born with a trained health worker present?",en-US,Less than 30%,Very Wrong,Around 50%,Wrong,More than 80%,Correct,More than 80%
-51,"In 1950, 0% of all energy consumption came from nuclear energy. What is that number today?",en-US,Around 5%,Correct,Around 25%,Wrong,Around 45%,Very Wrong,Around 5%
-120,How many cases of smallpox are expected in the world this year?,en-US,Zero cases,Correct,One hundred thousand cases,Wrong,One million cases,Very Wrong,Zero cases
-1509,How many adults in the world say they can read?,en-US,Less than 40%,Very Wrong,Around 60%,Wrong,More than 80%,Correct,More than 80%
-1753,In how many countries was slavery still legal in 1950? (Meaning there was no law or constitution banning the ownership of another person.),en-US,58,Very Wrong,98,Wrong,138,Correct,138
-1516,"Worldwide, men older than 25 have spent 8.9 years in school, on average. How many years have women of the same age spent in school?",en-US,2.4 years,Very Wrong,5.4 years,Wrong,8.4 years,Correct,8.4 years
-103,How many people in the world say they have confidence in their local police?,en-US,Less than 15%,Very Wrong,Around 25%,Wrong,More than 50%,Correct,More than 50%
-104,Which group of countries sent the most troops on UN peacekeeping missions during the past 5 years?,en-US,"Germany, Sweden, Netherlands, Ireland",Wrong,"Ethiopia, Rwanda, Bangladesh, India, Nepal",Correct,"France, USA, Japan, S Korea, Switzerland, UK",Very Wrong,"Ethiopia, Rwanda, Bangladesh, India, Nepal"
-118,"In the last 20 years, the proportion of people living in extreme poverty has...",en-US,More than halved,Correct,Remained more or less the same,Wrong,Almost doubled,Very Wrong,More than halved
-33,How many of the world's 1-year-old children were vaccinated against some disease in 2022?,en-US,Less than 25%,Very Wrong,Around 55%,Wrong,More than 85%,Correct,More than 85%
-89,What happened to the annual number of oil spills from tankers worldwide since the 1970s?,en-US,Decreased tenfold,Correct,Stayed about the same,Wrong,Increased tenfold,Very Wrong,Decreased tenfold
-91,"Globally, people eat an average of 6kg of beef and veal a year. How much fish is consumed on average per person?",en-US,Around 3kg,Very Wrong,Around 6kg,Wrong,Around 10kg,Correct,Around 10kg
-92,"Since 2016, what has happened to the share of marine protected areas in national waters worldwide?",en-US,Decreased by about 75%,Very Wrong,Stayed about the same,Wrong,Increased by about 75%,Correct,Increased by about 75%
-1768,In how many countries did the murder rate decrease between 2010 and 2021? (There are 195 countries.),en-US,Fewer than 15,Very Wrong,Around 50,Wrong,More than 85,Correct,More than 85
-1708,"Between 2012, and 2022, what happened to the number of people killed by terrorists?",en-US,25% decrease,Correct,About the same,Wrong,25% increase,Very Wrong,25% decrease
-1699,"In the year 2000 there were 92 journalists in prison, worldwide. How many were there in 2022?",en-US,About the same,Very Wrong,Around 260,Wrong,Around 360,Correct,Around 360
-1600,What share of all recorded homicides in the world are related to gangs and organized crime?,en-US,Around 20%,Correct,Around 40%,Wrong,Around 60%,Very Wrong,Around 20%
-1633,"70% of Europeans said they were planning to switch to a more environmentally friendly energy provider to fight climate change, in 2020. What was this number in China?",en-US,34%,Very Wrong,64%,Wrong,94%,Correct,94%
-1765,"Compared to the year 2000, how many species of animals, plants and fungi have been assessed in the wild and given a conservation status?",en-US,7% more,Very Wrong,70% more,Wrong,700% more,Correct,700% more
-55,"Compared to 1980, the cost of energy from solar panels today is roughly:",en-US,1% of the 1980 cost,Correct,21% of the 1980 cost,Wrong,41% of the 1980 cost,Very Wrong,1% of the 1980 cost
-1669,"Out of 195 countries, how many have banned gasoline containing lead?",en-US,75 countries,Very Wrong,135 countries,Wrong,195 countries,Correct,195 countries
-1580,"Of all children (5-17 years old) in the world, how many are exploited for child labor?",en-US,Around 10%,Correct,Around 30%,Wrong,Around 50%,Very Wrong,Around 10%
-1726,"Between 2001 and today, how many countries progressed from low-income to middle-income status?",en-US,Fewer than 5,Very Wrong,Around 15,Wrong,More than 30,Correct,More than 30
-45,"Worldwide, how many people living in rural areas use surface water (such as lakes, rivers and streams) as their drinking water?",en-US,Less than 10%,Correct,Around 30%,Wrong,More than 60%,Very Wrong,Less than 10%
-46,"Worldwide, how many people have no toilet of any kind, and instead have to use bushes, fields or streets?",en-US,Roughly 10%,Correct,Roughly 30%,Wrong,Roughly 50%,Very Wrong,Roughly 10%
-47,"Of all the freshwater used in the world, how much goes to agriculture?",en-US,Around 30%,Very Wrong,Around 50%,Wrong,Around 70%,Correct,Around 70%
-48,"How many countries (out of 195) have at least one desalination plant, removing salt from salt water?",en-US,30,Very Wrong,100,Wrong,180,Correct,180
-49,How many countries have rules requiring that local communities are included when planning and managing freshwater resources?,en-US,30%,Very Wrong,50%,Wrong,70%,Correct,70%
-1576,"Worldwide, how many people living in rural areas have a toilet that they don't have to share with other households?",en-US,Around 20%,Very Wrong,Around 40%,Wrong,Around 60%,Correct,Around 60%
-1577,How many people in the world have soap and water to wash their hands at home?,en-US,Around 20%,Very Wrong,Around 40%,Wrong,Around 70%,Correct,Around 70%
-1626,How many people in the world have to make a round trip of more than 30 minutes to collect drinking water?,en-US,Around 10%,Correct,Around 25%,Wrong,Around 50%,Very Wrong,Around 10%
-1562,What share of the world population uses toilets connected to sewers?,en-US,Around 40%,Correct,Around 55%,Wrong,Around 70%,Very Wrong,Around 40%
-54,"Of all renewable energy used in the world today, what share comes from traditional burning of biomass like charcoal, wood and agricultural waste?",en-US,Around 10%,Very Wrong,Around 25%,Wrong,Around 40%,Correct,Around 40%
-1637,"What share of all energy used in the world comes from the modern renewable sources (solar, wind, hydro and modern biofuels)?",en-US,Less than 10%,Correct,Around 20%,Wrong,Around 35%,Very Wrong,Less than 10%
-1638,"Of the total cost of electricity worldwide, how much is the transfer from the power station to the user, on average?",en-US,Less than 1%,Very Wrong,Around 15%,Wrong,Around 30%,Correct,Around 30%
-1682,"Since 1970, what happened to the amount of energy consumed per person globally?",en-US,Stayed more or less the same,Wrong,It increased by around 50%,Correct,It increased by around 250%,Very Wrong,It increased by around 50%
-1692,"Compared to 100 years ago, how affordable is electricity in the US today?",en-US,About the same,Very Wrong,10 times more affordable,Wrong,200 times more affordable,Correct,200 times more affordable
-1750,"In 2000, around 50% of the world's population didn't have modern stoves. Instead they cooked food by burning wood, charcoal or dung. What is the share today?",en-US,Around 30%,Correct,Around 40%,Wrong,Around 50%,Very Wrong,Around 30%
-1522,Since 1970 the average income in the 40 richest countries more than doubled. The average income in the other countries…?,en-US,Declined to half,Very Wrong,Stayed roughly the same,Wrong,More than doubled,Correct,More than doubled
-1578,"Worldwide, what share of young people (aged 15-24) that work are in informal employment?",en-US,Around 25%,Very Wrong,Around 50%,Wrong,Around 75%,Correct,Around 75%
-1587,"How many young men in the world (aged 15-24) are not engaged in education, employment or training?",en-US,Around 15%,Correct,Around 25%,Wrong,Around 35%,Very Wrong,Around 15%
-1643,How many people in low-income countries have access to some form of bank account?,en-US,Around 15%,Very Wrong,Around 25%,Wrong,Around 40%,Correct,Around 40%
-1645,"Around 25% of the economies of high-income countries come from the industry sector (including manufacturing and construction), on average. How much is this in low-income countries?",en-US,Around 25%,Correct,Around 45%,Wrong,Around 65%,Very Wrong,Around 25%
-1653,"Worldwide, how many workers have informal jobs that are not registered with the government (and are not in agriculture)?",en-US,Around 20%,Very Wrong,Around 40%,Wrong,Around 60%,Correct,Around 60%
-1681,What is the average income of small family farms in low-income countries?,en-US,About $1.50 a day,Correct,About $5 a day,Wrong,About $10 a day,Very Wrong,About $1.50 a day
-1727,"Of all money earned in the world, how much comes from work in industry, manufacturing and construction?",en-US,28%,Correct,38%,Wrong,53%,Very Wrong,28%
-1754,"What is the official method for updating the list of countries called ""Developing countries""?",en-US,GDP per capita threshold,Wrong,There is no procedure,Correct,Voting at the UN General Assembly,Very Wrong,There is no procedure
-61,Ports in high-income countries handled around 350 million containers in 2020. What was that number for low- and middle-income countries?,en-US,Less than 200 million,Very Wrong,Around 300 million,Wrong,More than 400 million,Correct,More than 400 million
-63,How many people in the world live within range of a 3G or higher quality mobile network?,en-US,Around 40%,Very Wrong,Around 65%,Wrong,Around 90%,Correct,Around 90%
-1581,"In 2021, cargo ships worldwide carried 11 billion tons. What share of the cargo was oil, petroleum products, gas and chemicals?",en-US,Around 30%,Correct,Around 40%,Wrong,Around 50%,Very Wrong,Around 30%
-1625,"In 2020, the manufacturing sector contributed how much to the world's economy?",en-US,Around 15%,Correct,Around 30%,Wrong,Around 45%,Very Wrong,Around 15%
-1685,"Globally, which region has the highest number of mobile money accounts?",en-US,East Asia and Pacific,Wrong,Sub-Saharan Africa,Correct,Europe and Central Asia,Very Wrong,Sub-Saharan Africa
-73,"Of 195 countries, how many have promised to create laws against racism, in accordance with the UN Convention on the Elimination of All Forms of Racial Discrimination?",en-US,10 countries,Very Wrong,80 countries,Wrong,180 countries,Correct,180 countries
-74,What share of the members of the International Monetary Fund (IMF) are low- and middle-income countries?,en-US,Around 20%,Very Wrong,Around 45%,Wrong,Around 70%,Correct,Around 70%
-75,"Since it was founded, the International Monetary Fund (IMF) has had 12 managing directors. How many of them were born in Europe?",en-US,4,Very Wrong,8,Wrong,12,Correct,12
-1690,What share of Europe's population are migrants?,en-US,Around 12%,Correct,Around 22%,Wrong,Around 32%,Very Wrong,Around 12%
-77,"How much of the world’s total land surface has some physical infrastructure built on it, like houses or roads (excluding farm land)?",en-US,Less than 5%,Correct,Around 15%,Wrong,More than 25%,Very Wrong,Less than 5%
-78,"In 2020, there were around 68 cities in Africa with more than 1 million inhabitants. How many cities of this size will there be in Africa in 2030, according to UN experts?",en-US,Fewer than 60,Very Wrong,Around 70,Wrong,More than 90,Correct,More than 90
-1582,How many megacities are there in the whole world (metropolitan areas with more than 10 million inhabitants)?,en-US,Around 12,Very Wrong,Around 23,Wrong,Around 35,Correct,Around 35
-1707,"If current trends continue, in 2100 the biggest city in the world is expected to be in:",en-US,Asia,Wrong,Africa,Correct,America,Very Wrong,Africa
-1709,"When listing the ten most expensive cities to live in the world in 2022, how many were in Europe?",en-US,4,Correct,6,Wrong,9,Very Wrong,4
-81,How many years did it take between the discovery of ozone depleting substances and the signing of an international agreement to ban them?,en-US,14 years,Correct,24 years,Wrong,34 years,Very Wrong,14 years
-83,"Globally, how much food is lost between being harvested and sold in stores?",en-US,Around 15%,Correct,Around 30%,Wrong,Around 45%,Very Wrong,Around 15%
-84,"Of all waste collected from homes, businesses and schools worldwide, how much of the total weight is plastic?",en-US,Around 12%,Correct,Around 42%,Wrong,Around 72%,Very Wrong,Around 12%
-1585,What share of the world’s waste is generated in North America?,en-US,Around 14%,Correct,Around 28%,Wrong,Around 42%,Very Wrong,Around 14%
-1586,"Compared to people in high-income countries, how much waste do people living in middle-income countries generate per person per day?",en-US,Double the amount of waste,Very Wrong,About the same,Wrong,Half the amount of waste,Correct,Half the amount of waste
-1628,"Globally, what share of waste that is collected from households, businesses and streets ends up in managed landfills (not open dump sites)?",en-US,Around 35%,Correct,Around 70%,Wrong,Around 95%,Very Wrong,Around 35%
-16,"After water, which raw material is most used in the world in terms of volume?",en-US,Oil,Wrong,Sand,Correct,Wood,Very Wrong,Sand
-1710,"What share of waste that is collected from households, businesses and streets is food and other green waste (e.g. grass cuttings)?",en-US,Around 45%,Correct,Around 55%,Wrong,Around 65%,Very Wrong,Around 45%
-1747,"Of all greenhouse gases emitted in the world in 2004, around 1% required some kind of carbon tax or fee to be paid. What was the number in 2022?",en-US,Around 3%,Very Wrong,Around 13%,Wrong,Around 23%,Correct,Around 23%
-1760,How much raw material does each person in high-income countries use every year compared to those in middle-income countries?,en-US,60% less,Very Wrong,About the same,Wrong,60% more,Correct,60% more
-1590,How does water vapor contribute to the greenhouse effect (which keeps the earth’s atmosphere warm)?,en-US,Water vapor does not contribute,Wrong,Water vapor contributes significantly,Correct,Water vapor is not a greenhouse gas,Very Wrong,Water vapor contributes significantly
-1592,"Globally, how much has the average sea level been rising per year over the past 100 years?",en-US,2mm per year (0.08 inches),Correct,20mm per year (0.8 inches),Wrong,200mm per year (8 inches),Very Wrong,2mm per year (0.08 inches)
-1623,"Currently, the average temperature on Earth is 15C. What would the average temperature on Earth be without greenhouse gases?",en-US,-18C,Correct,+6C,Wrong,+21C,Very Wrong,-18C
-1634,"70% of Europeans said they were planning to switch to a more environmentally friendly energy provider to fight climate change, in 2020. What was this number in the US?",en-US,24%,Very Wrong,44%,Wrong,64%,Correct,64%
-1639,Which of these gases is NOT a greenhouse gas?,en-US,Water vapor,Wrong,Nitrogen gas,Correct,Ozone,Very Wrong,Nitrogen gas
-1649,When did scientists first start to realize that human activities have the power to influence the climate?,en-US,Around 1900,Correct,Around 1950,Wrong,Around 1980,Very Wrong,Around 1900
-1728,"If we stopped emitting greenhouse gases today, for how long would the sea level continue to rise?",en-US,About 10 years,Very Wrong,About 100 years,Wrong,"Over 1,000 years",Correct,"Over 1,000 years"
-1749,"If we stopped all carbon emissions (CO2) today, roughly how long would it take before the CO2 concentration in the atmosphere is back to the levels back in 1750?",en-US,Less than 100 years,Very Wrong,About 500 years,Wrong,More than 1000 years,Correct,More than 1000 years
-82,How many of the world’s 250 richest companies describe climate change as a risk in their annual reports?,en-US,About 10%,Very Wrong,About 30%,Wrong,About 60%,Correct,About 60%
-1660,How many of the world’s 250 richest companies have set targets to cut their carbon emissions?,en-US,About 20%,Very Wrong,About 50%,Wrong,About 80%,Correct,About 80%
-1756,"The people in high income-income countries emit how much CO2 per person, on average, compared to the poorest 50% of the world population?",en-US,About the same,Very Wrong,20% more,Wrong,Five times more,Correct,Five times more
-1759,"Since 1850, what share of all carbon emissions have come from Europe and North America?",en-US,Around 50%,Correct,Around 75%,Wrong,Around 90%,Very Wrong,Around 50%
-1762,The UN’s scenarios for stopping global warming rely on technologies to capture and store carbon from the atmosphere. How many of these technologies are already affordable and used today?,en-US,Less than 1%,Correct,Around 15%,Wrong,Around 30%,Very Wrong,Less than 1%
-90,"Oil and gas were the marine sectors that made the most money in 2010. If trends continue as before, which sector will it be in 2030?",en-US,Oil and gas,Wrong,Tourism,Correct,Wave energy,Very Wrong,Tourism
-93,"How many countries have ratified the Law of the Sea, a UN convention introduced in 1982? (There are 195 countries.)",en-US,49 countries,Very Wrong,109 countries,Wrong,169 countries,Correct,169 countries
-1497,What share of all fish caught for food across the world comes from fish farming?,en-US,Roughly 10%,Very Wrong,Roughly 35%,Wrong,Roughly 55%,Correct,Roughly 55%
-1520,What happened to the total volume of fish caught in the wild every year during the past 20 years?,en-US,It decreased by roughly 30%,Wrong,It stayed about the same,Correct,It increased by roughly 30%,Very Wrong,It stayed about the same
-1712,"Before countries signed the High Seas Treaty in March 2023, around 1% of international oceans were protected. What number does the new agreement aim to achieve?",en-US,Around 5%,Very Wrong,Around 15%,Wrong,Around 30%,Correct,Around 30%
-98,"When counting the total body weight of all mammals in the world, roughly how much comes from wild mammals?",en-US,Around 5%,Correct,Around 25%,Wrong,Around 50%,Very Wrong,Around 5%
-99,What share of all agricultural land worldwide is used for feeding animals?,en-US,Roughly 25%,Very Wrong,Roughly 50%,Wrong,Roughly 80%,Correct,Roughly 80%
-100,What happened to the global forest area in the last 30 years?,en-US,Decreased around 50%,Very Wrong,Decreased around 30%,Wrong,Decreased around 10%,Correct,Decreased around 10%
-101,"Of all the area that was tropical rainforest 100 years ago worldwide, how much is still tropical rainforest today?",en-US,Less than 10%,Very Wrong,Around 20%,Wrong,More than 30%,Correct,More than 30%
-1529,"Worldwide, what happened to the size of the land areas declared as protected in the last 30 years?",en-US,Decreased by 58%,Very Wrong,Decreased by 8%,Wrong,Increased by 8%,Correct,Increased by 8%
-1512,"Globally, what has happened to the size of the land area used for agriculture over the last 50 years?",en-US,Increased by 50%,Wrong,Stayed about the same,Correct,Decreased by 50%,Very Wrong,Stayed about the same
-1583,How many species of animals and plants are confirmed by biologists to have gone extinct in the last 200 years?,en-US,Around 600,Correct,"Around 60,000",Wrong,"Around 600,000",Very Wrong,Around 600
-1596,How much of the oxygen that the Amazon forest produces do we humans breathe?,en-US,Less than 1%,Correct,30%,Wrong,60%,Very Wrong,Less than 1%
-1597,"Of all oxygen produced in a year, how much comes from trees?",en-US,Around 25%,Correct,Around 55%,Wrong,Around 85%,Very Wrong,Around 25%
-1598,"Of all the known species of birds worldwide, how many are endangered or threatened?",en-US,Around 15%,Correct,Around 35%,Wrong,Around 55%,Very Wrong,Around 15%
-1647,"What share of the total area burned by wildfires globally was in Africa, between 2001 and 2018?",en-US,Around 10%,Very Wrong,Around 40%,Wrong,Around 70%,Correct,Around 70%
-107,"In 2000, 21% of all countries told the UN they had an independent human rights institution. What is the share today?",en-US,23% of countries,Very Wrong,41% of countries,Wrong,62% of countries,Correct,62% of countries
-102,What happened to the number of pirate attacks on boats worldwide in the last 10 years?,en-US,It doubled,Very Wrong,It didn’t change much,Wrong,It halved,Correct,It halved
-1547,What share of all battle deaths in the past 10 years occurred in Africa?,en-US,Around 15%,Correct,Around 35%,Wrong,Around 55%,Very Wrong,Around 15%
-1599,"What share of the world's population is suspected, arrested or cautioned by the police or criminal justice systems each year?",en-US,Around 2%,Correct,Around 20%,Wrong,Around 40%,Very Wrong,Around 2%
-1610,"Across the world, what share of international migrants are women and girls?",en-US,Roughly 18%,Very Wrong,Roughly 33%,Wrong,Roughly 48%,Correct,Roughly 48%
-1621,The 15 countries where most people say they trust others are:,en-US,Low-income countries,Very Wrong,Middle-income countries,Wrong,High-income countries,Correct,High-income countries
-1629,Roughly 35 countries every year experienced some sort of violent conflict during the past 5 years. What was the average number during the 1950s?,en-US,Around 15 countries,Correct,Around 30 countries,Wrong,Around 60 countries,Very Wrong,Around 15 countries
-1635,"When the Open Skies Treaty (which allows members to conduct observation flights over each others' land) began in 2002, 34 countries were members. How many are there today?",en-US,32,Correct,42,Wrong,62,Very Wrong,32
-1618,"Between 2011 and 2022, how many times did the US and Russia share information with each other about their nuclear weapons?",en-US,Zero times,Very Wrong,25 times,Wrong,25 thousand times,Correct,25 thousand times
-1568,"Of all firearms in the world, what share is owned by civilians?",en-US,Roughly 25%,Very Wrong,Roughly 55%,Wrong,Roughly 85%,Correct,Roughly 85%
-1702,"Worldwide, how many homicide victims are men?",en-US,Around 40%,Very Wrong,Around 60%,Wrong,Around 80%,Correct,Around 80%
-1724,"Of all the journalists in prison globally at the end of 2017, around 20% were in Russia, China, Iran, Myanmar and Belarus combined. What was this figure in 2022?",en-US,Around 23%,Very Wrong,Around 38%,Wrong,Around 53%,Correct,Around 53%
-112,What share of the least developed countries in the world have foreign investment promotion agencies?,en-US,20%,Very Wrong,50%,Wrong,80%,Correct,80%
-110,"Of all education costs in the world, how much is paid by governments?",en-US,Around 40%,Very Wrong,Around 60%,Wrong,Around 80%,Correct,Around 80%
-108,"In high-income countries during the past 40 years, what happened to the top marginal income taxes that apply to the richest 0.1% of the population?",en-US,They were cut by one third,Correct,They stayed roughly the same,Wrong,They increased by one third,Very Wrong,They were cut by one third
-1624,"Compared to the total international aid from rich governments in 2022, how much money did migrants across the world transfer back home to their countries of origin?",en-US,Less than 10% of total aid,Very Wrong,Around 30% of total aid,Wrong,More than 200% of total aid,Correct,More than 200% of total aid
-1648,What share of the members of the World Trade Organization are low- and middle-income countries?,en-US,Around 25%,Very Wrong,Around 45%,Wrong,Around 65%,Correct,Around 65%
-1651,Which free trade area includes the most countries?,en-US,North Atlantic Free Trade Agreement (NAFTA),Very Wrong,African Continental Free Trade Area,Correct,The European Union,Wrong,African Continental Free Trade Area
-1687,The US has 4% of the world's population. What share of votes does it have when the World Bank decides on development loans to poorer countries?,en-US,16%,Correct,36%,Wrong,56%,Very Wrong,16%
-1705,Charities and governments across the world give money to the least developed countries. When comparing the total amounts given…,en-US,Charities give 20 times more,Very Wrong,Governments give 20 times more,Correct,They give roughly the same,Wrong,Governments give 20 times more
-1616,How much of the economy of low-income countries comes from money transferred from citizens living abroad?,en-US,Roughly 6%,Correct,Roughly 26%,Wrong,Roughly 46%,Very Wrong,Roughly 6%
-39,How many child refugees attended primary school in 2021?,en-US,Less than 20%,Very Wrong,Around 40%,Wrong,More than 60%,Correct,More than 60%
-1594,"At the end of 2022, the world population was roughly 8 billion people. How many were international refugees, asylum seekers or displaced abroad by humanitarian crises?",en-US,46 million (0.6%),Correct,480 million (6%),Wrong,720 million (9%),Very Wrong,46 million (0.6%)
-1662,"In 2018, what share of refugee-hosting countries allowed refugees the right to work?",en-US,Around 10%,Very Wrong,Around 30%,Wrong,Around 45%,Correct,Around 45%
-1670,"In 2022, around 10% of the world's population live in low-income countries. What share of the global refugee population live there?",en-US,Less than 20%,Correct,Around 35%,Wrong,More than 50%,Very Wrong,Less than 20%
-1673,"In 2000, around 20% of all refugees worldwide fled to a high-income country. What was the number in 2021 (right before the war in Ukraine)?",en-US,Around 20%,Correct,Around 30%,Wrong,Around 40%,Very Wrong,Around 20%
-1674,"In Uganda, Bangladesh, Colombia, and Türkiye, there were a total of 0.4 million refugees and migrants from humanitarian crises in 2010. What was the number in 2022?",en-US,Around 1 million,Very Wrong,Around 5 million,Wrong,Around 9 million,Correct,Around 9 million
-1675,"Of all Syrian refugees in Egypt, Lebanon, Jordan and Iraq in 2022, how many said they couldn't afford their basic needs, such as food, medicine and housing?",en-US,Around 30%,Very Wrong,Around 60%,Wrong,Around 90%,Correct,Around 90%
-1677,"In 1990, people and governments gave $1.3 billion to UNHCR to help refugees across the world. How much was given in 2022? (UNHCR is the United Nations Refugee Agency. The amounts are adjusted for inflation.)",en-US,Slightly less,Very Wrong,2 times more,Wrong,5 times more,Correct,5 times more
-1723,"Where do the majority of people who are forced to flee their homes due to conflict, persecution or disaster move to?",en-US,Neighboring countries,Wrong,Stay within their own country,Correct,High-income countries,Very Wrong,Stay within their own country
-1515,What is the life expectancy of the world population?,en-US,50 years,Very Wrong,60 years,Wrong,70 years,Correct,70 years
-1514,Where does the majority of the world population live?,en-US,Low-income countries,Wrong,Middle-income countries,Correct,High-income countries,Very Wrong,Middle-income countries
-1518,"Tigers, Giant Pandas and Mountain Gorillas were listed as threatened species in 1996. Since then, have any of these species become more critically endangered?",en-US,None of them,Correct,One of them,Wrong,Two of them,Very Wrong,None of them
-1519,"The global climate experts believe that, over the next 100 years, the average temperature will:",en-US,Get warmer,Correct,Remain the same,Wrong,Get colder,Very Wrong,Get warmer
-1601,"If all kinds of cancer could be cured, how much longer would lives be on average worldwide?",en-US,Around 3.5 years longer,Correct,Around 11.5 years longer,Wrong,Around 19.5 years longer,Very Wrong,Around 3.5 years longer
-1546,What share of all overweight children under 5 in the world live in Africa?,en-US,Around 5%,Very Wrong,Around 15%,Wrong,Around 25%,Correct,Around 25%
-1525,Eighty percent of all men in the world were in the labor force in 2019. What was the number for women?,en-US,52%,Correct,62%,Wrong,72%,Very Wrong,52%
-35,How many university students worldwide get their degree in their home country (as opposed to abroad)?,en-US,Around 77%,Very Wrong,Around 87%,Wrong,Around 97%,Correct,Around 97%
-36,"Of all primary school teachers in low-income countries, how many are trained?",en-US,30%,Very Wrong,50%,Wrong,70%,Correct,70%
-38,"Across the world, how many children go to some form of preschool the year before they start school?",en-US,Around 20%,Very Wrong,Around 40%,Wrong,Around 60%,Correct,Around 60%
-1575,"Of all children in the world who are of primary school age but don't go to school, how many are girls?",en-US,Around 55%,Correct,Around 65%,Wrong,Around 75%,Very Wrong,Around 55%
-1579,"Globally, how many primary school teachers are not trained for the job?",en-US,Around 15%,Correct,Around 35%,Wrong,Around 55%,Very Wrong,Around 15%
-1561,"In 1950, roughly 50% of all adults had at least some basic education. What is the share today?",en-US,Around 40%,Very Wrong,Around 60%,Wrong,Around 80%,Correct,Around 80%
-1612,How many primary schools in Sub-Saharan Africa have some access to electricity?,en-US,Around 10%,Very Wrong,Around 20%,Wrong,Around 30%,Correct,Around 30%
-1711,What share of university teachers in the world are women?,en-US,Around 20%,Very Wrong,Around 30%,Wrong,Around 40%,Correct,Around 40%
-34,"Worldwide, there are around 38 million people living with HIV. How many of them got anti-HIV drugs in 2021?",en-US,Less than 15%,Very Wrong,Roughly 30%,Wrong,More than 50%,Correct,More than 50%
-1573,In what share of all schools in the world do children have soap and water to wash their hands with?,en-US,Around 25%,Very Wrong,Around 55%,Correct,Around 85%,Wrong,Around 55%
-119,"In 1990, more than 4 million people died from indoor pollution. What has happened to the number who die each year since?",en-US,It decreased more than 30%,Correct,It stayed about the same,Wrong,It increased more than 30%,Very Wrong,It decreased more than 30%
-95,"In 1990, around 9% of children worldwide died before age five. What is the number today?",en-US,Around 4%,Correct,Around 9%,Wrong,Around 14%,Very Wrong,Around 4%
-94,"The average length of life worldwide is roughly 72, today. What was the global life expectancy a hundred years ago?",en-US,37 years,Correct,47 years,Wrong,57 years,Very Wrong,37 years
-1605,How many children in Europe (including Russia and Türkiye) receive the full two doses of the measles vaccine?,en-US,Around 30%,Very Wrong,Around 60%,Wrong,Around 90%,Correct,Around 90%
-1548,How many children aged under 5 in Sub-Saharan Africa sleep under a bed net to prevent malaria?,en-US,Around 25%,Very Wrong,Around 35%,Wrong,Around 50%,Correct,Around 50%
-113,"What share of the world’s population today live in countries where life expectancy is shorter than 50 years (in 1960, it was around 55%)?",en-US,Less than 1%,Correct,Around 30%,Wrong,Around 60%,Very Wrong,Less than 1%
-1641,What is the average life expectancy in Sub-Saharan Africa?,en-US,Around 40 years,Very Wrong,Around 50 years,Wrong,Around 60 years,Correct,Around 60 years
-1655,"Worldwide, what share of all deaths are directly caused by illegal drug use?",en-US,Less than 1%,Correct,Around 10%,Wrong,More than 20%,Very Wrong,Less than 1%
-1656,"Of all pregnant women in the world, how many give birth without first being examined by an educated midwife, nurse or doctor?",en-US,Around 15%,Correct,Around 35%,Wrong,Around 55%,Very Wrong,Around 15%
-1683,What share of all road fatalities globally occur in low- and middle-income countries?,en-US,Around 50%,Very Wrong,Around 70%,Wrong,Around 90%,Correct,Around 90%
-1560,How many babies in the world are born in health facilities?,en-US,Around 40%,Very Wrong,Around 60%,Wrong,Around 80%,Correct,Around 80%
-1688,What share of adults worldwide used drugs in 2020 (other than alcohol or medicine)?,en-US,Around 5%,Correct,Around 25%,Wrong,Around 50%,Very Wrong,Around 5%
-1570,Which of these risk factors leads to more deaths?,en-US,Dirty water,Very Wrong,Polluted air,Correct,Contaminated food,Wrong,Polluted air
-1703,How many countries (out of 194) participate in some of the World Health Organization’s programmes for child vaccination?,en-US,64,Very Wrong,114,Wrong,194,Correct,194
-30,The low-income countries of today had a life expectancy of 44 years back in 1970. What is it now?,en-US,40 years,Very Wrong,50 years,Wrong,60 years,Correct,60 years
-1686,"Governments sometimes sell land to foreign companies. Since 2009, the total reported number of hectares sold annually worldwide has:",en-US,Dropped to less than half,Correct,Stayed about the same,Wrong,Doubled,Very Wrong,Dropped to less than half
-1725,"Globally, a person with less than $2.15 a day is said to be living in extreme poverty. In the 40 richest countries, the national poverty lines are roughly:",en-US,$5 a day,Very Wrong,$10 a day,Wrong,$20 a day,Correct,$20 a day
-1556,"Worldwide, how many people living in rural areas have some access to electricity?",en-US,Less than 30%,Very Wrong,About 50%,Wrong,More than 70%,Correct,More than 70%
-1530,"Of all people in the world living in extreme poverty, with less than $2 a day, how many live in middle-income countries?",en-US,Around 15%,Very Wrong,Around 30%,Wrong,Around 60%,Correct,Around 60%
-24,How many people in the world cook using stoves that don't produce smoke?,en-US,Around 20%,Very Wrong,Around 45%,Wrong,Around 70%,Correct,Around 70%
-22,How many countries (of 195) have some form of social security benefits for people with disabilities?,en-US,46,Very Wrong,116,Wrong,186,Correct,186
-1527,"During the last 70 years, there were roughly 135 armed conflicts between sovereign countries. What was the number in the 70 years before that?",en-US,Fewer than 80,Very Wrong,Around 130,Wrong,More than 180,Correct,More than 180
-1744,"In 1990, around 17% of all electricity produced worldwide came from nuclear energy. What is that number today?",en-US,10%,Correct,25%,Wrong,40%,Very Wrong,10%
-1743,"In the year 2000, more than 20% of the world's population did not have any access to electricity. What is the share today?",en-US,Around 10%,Correct,Around 20%,Wrong,Around 30%,Very Wrong,Around 10%
-1742,"In the poorest 25 countries, how many more children die among the poorest 20% of the population, compared to the richest 20%?",en-US,Around 5 times more,Correct,Around 30 times more,Wrong,Around 60 times more,Very Wrong,Around 5 times more
-1739,"In 2010, 1.1 billion people did not have access to electricity. What has happened to this number since then?",en-US,It decreased by 30%,Correct,It stayed the same,Wrong,It increased by 30%,Very Wrong,It decreased by 30%
-1554,How many companies globally are partially owned by women?,en-US,Around 5%,Very Wrong,Around 15%,Wrong,Around 30%,Correct,Around 30%
-1698,"In the European Union, for every 100,000 babies that are born, around six women die during pregnancy, while giving birth or shortly after. What is this figure in the United States?",en-US,About the same,Very Wrong,Double,Wrong,Over three times more,Correct,Over three times more
-1704,NATO members committed to spend a minimum share of their countries’ total incomes (GDP) each year on defense. What is that figure?,en-US,2%,Correct,8%,Wrong,22%,Very Wrong,2%
-1700,Graphite is used in electric-vehicle batteries and dysprosium is used in electric-vehicle motors. What share of those materials is processed in China?,en-US,Around 25%,Very Wrong,Around 65%,Wrong,Close to 100%,Correct,Close to 100%
-1714,Child mortality in Latin America today is at the same level it was in the United States in what year?,en-US,1900,Very Wrong,1940,Wrong,1980,Correct,1980
-1713,"Between 1990 and 2022 the average income in the U.S increased 60%, adjusted for inflation. What happened to the average income in Latin America?",en-US,Decreased 15%,Very Wrong,Increased 5%,Wrong,Increased 55%,Correct,Increased 55%
-1588,How many of the world's 250 largest companies (by revenue) are now reporting on sustainability each year?,en-US,Around 35%,Very Wrong,Around 65%,Wrong,Around 95%,Correct,Around 95%
-1671,"In the next 30 years, UN experts expect the world’s urban population to grow to 6.7 billion people. What do they think will happen to the rural population in that period?",en-US,It will decrease around 10%,Correct,It will stay about the same,Wrong,It will increase around 10%,Very Wrong,It will decrease around 10%
-1701,Which of the following options cause most deaths in Latin America and the Caribbean?,en-US,Drugs and Violence,Very Wrong,Heart Disease & Strokes,Correct,Diabetes and Undernourishment,Wrong,Heart Disease & Strokes
-1614,"In the year 2000, the exports from high-income countries were worth 4 times more than those from all other countries combined. What was that figure in 2022?",en-US,High-income export around 2 times more,Correct,High-income export around 20 times more,Wrong,High-income export around 40 times more,Very Wrong,High-income export around 2 times more
-1679,"Worldwide, between 2000 and 2010, there were 140 journalists in prison each year on average. How many were there in 2022?",en-US,About the same,Very Wrong,50% more,Wrong,150% more,Correct,150% more
-1615,What share of people in low-income countries live in extreme poverty (with less than $2 a day)?,en-US,Around 50%,Correct,Around 70%,Wrong,Around 90%,Very Wrong,Around 50%
-1602,What happened to the suicide rate in the European Union between 1999 and 2019?,en-US,Decreased by 25%,Correct,Stayed about the same,Wrong,Increased by 25%,Very Wrong,Decreased by 25%
-1609,Which of these represent the largest flow of money to Africa?,en-US,Direct Investment by foreign companies,Very Wrong,Development Aid from foreign governments,Wrong,Remittance transfers from Africans abroad,Correct,Remittance transfers from Africans abroad
-1606,How many 15-year-old boys drink alcohol weekly in Europe (including Russia and Türkiye)?,en-US,Around 15%,Correct,Around 30%,Wrong,Around 45%,Very Wrong,Around 15%
-1540,"In 2019, how many people worldwide died from infections that were resistant to antibiotics?",en-US,Around 10 thousand deaths,Very Wrong,Around 100 thousand deaths,Wrong,More than 1 million deaths,Correct,More than 1 million deaths
-1572,"Of all children born in low-income countries in 1990, about 18% died before their 5th birthday. What is the number today?",en-US,Less than 10%,Correct,Around 20%,Wrong,More than 30%,Very Wrong,Less than 10%
-1569,"Worldwide, how many children under 5 are dangerously underweight because of lack of food or serious illness?",en-US,Around 8%,Correct,Around 28%,Wrong,Around 48%,Very Wrong,Around 8%
-1564,"In the year 1800, 98% of all energy consumption came from traditional biofuels (such as burning wood and agricultural waste biomass). What is the share today?",en-US,Less than 10%,Correct,Around 25%,Wrong,More than 50%,Very Wrong,Less than 10%
-1565,"During the past 120 years, roughly 20% of all violent activist campaigns worldwide succeeded in achieving some regime change. What share of non-violent campaigns succeed?",en-US,Around 5%,Very Wrong,Around 25%,Wrong,Around 50%,Correct,Around 50%
-1566,"Of all money used by governments in rich countries, how much is given as international aid to poorer countries?",en-US,Around 0.5%,Correct,Around 2.5%,Wrong,Around 6.5%,Very Wrong,Around 0.5%
-1559,"Between 2013 and 2020, the amount of methamphetamine (an illegal drug) that was seized by customs worldwide…",en-US,Remained about the same,Very Wrong,Almost doubled,Wrong,More than tripled,Correct,More than tripled
-1557,In what share of all countries is homosexuality legal?,en-US,Around 25%,Very Wrong,Around 45%,Wrong,Around 65%,Correct,Around 65%
-1508,"Globally, how many people suffer from depression, according to the World Health Organization?",en-US,Around 5%,Correct,Around 25%,Wrong,Around 45%,Very Wrong,Around 5%
-1555,"Roughly how much is the minimum wage for a day’s work across the USA, Canada, Australia, New Zealand and the European Union, on average?",en-US,Around $15 a day,Very Wrong,Around $30 a day,Wrong,Around $60 a day,Correct,Around $60 a day
-1551,"In 1970, 30% of all energy used in the world came from burning coal. What happened to that share since then?",en-US,It almost halved,Wrong,It stayed about the same,Correct,It almost doubled,Very Wrong,It stayed about the same
-1552,42% of all university teachers in high-income countries are women. What is the share in middle-income countries?,en-US,15%,Very Wrong,30%,Wrong,45%,Correct,45%
-1549,"Worldwide, 22% of children are much shorter than average because they’ve had too little food sometime during their first five years of life. What is that number in Sub-Saharan Africa?",en-US,32%,Correct,52%,Wrong,72%,Very Wrong,32%
-1544,What share of the population in towns and cities in Sub-Saharan Africa have some access to electricity?,en-US,Around 25%,Very Wrong,Around 50%,Wrong,Around 75%,Correct,Around 75%
-1541,"What would it cost to install clean water and sanitation in all healthcare facilities in the 46 poorest countries, expressed as % of total international aid in 2020:",en-US,Around 5%,Correct,Around 55%,Wrong,Around 105%,Very Wrong,Around 5%
-1539,"In low-income countries, around 45% of all deaths are caused by infections. What is the number in high-income countries?",en-US,Around 5%,Correct,Around 15%,Wrong,Around 35%,Very Wrong,Around 5%
diff --git a/automation-api/yival_experiments/data/questions_zh-CN.csv b/automation-api/yival_experiments/data/questions_zh-CN.csv
deleted file mode 100644
index 08b4d48..0000000
--- a/automation-api/yival_experiments/data/questions_zh-CN.csv
+++ /dev/null
@@ -1,285 +0,0 @@
-question_id,question_text,language,option_a,option_a_correctness,option_b,option_b_correctness,option_c,option_c_correctness,correct_answer
-40,在世界范围内，妇女在国家议会中所占的席位比例是多少？,zh-CN,大约 5%,Very Wrong,大约 15%,Wrong,大约 25%,Correct,大约 25%
-60,世界上有多少人拥有移动电话（和电话号码）？,zh-CN,接近 70%,Correct,接近 80%,Wrong,接近 90%,Very Wrong,接近 70%
-1,全球自杀率在近 20 年里有什么变化？,zh-CN,减少了大约 25%,Correct,基本没有变化,Wrong,增加了大约 25%,Very Wrong,减少了大约 25%
-4,在高收入国家（如德国和美国），生活在极端贫困（每天的收入低于 2 美元）中的人口占总人口的比例为多少？,zh-CN,少于 1%,Correct,大约 11％,Wrong,大约 21％,Very Wrong,少于 1%
-59,过去 40 年间，已知储量中剩余的石油和天然气数量：,zh-CN,减少到一半以下,Very Wrong,大致保持不变,Wrong,增加一倍以上,Correct,增加一倍以上
-1741,在人类活动排放的所有温室气体中，有多少来自燃烧化石燃料？,zh-CN,大约 25％,Very Wrong,大约 45％,Wrong,大约 65％,Correct,大约 65％
-1570,以下哪种风险因素会导致更多死亡？,zh-CN,不干净的水源,Very Wrong,受污染的空气,Correct,受污染的食物,Wrong,受污染的空气
-1737,在高收入国家中，有多少人认为气候变化会在未来 20 年内对他们的国家构成威胁？,zh-CN,大约 20％,Very Wrong,大约 50%,Wrong,大约 80％,Correct,大约 80％
-1691,在全世界使用的所有能源中，电力占多大比例？,zh-CN,20%左右,Correct,50%左右,Wrong,80%左右,Very Wrong,20%左右
-1731,2022 年，全世界增加的发电量中有多少来自可再生能源？,zh-CN,大约 10％,Very Wrong,大约 45％,Wrong,大约 85％,Correct,大约 85％
-1636,与前一年相比，由于新冠病毒大流行，2020 年全球二氧化碳排放量减少了多少？,zh-CN,减少 5% 左右,Correct,减少 20% 左右,Wrong,减少 40% 左右,Very Wrong,减少 5% 左右
-85,如果我们现在把每年的二氧化碳净排放量减半，全球平均温度会发生什么变化？,zh-CN,会降低,Very Wrong,会保持不变,Wrong,会持续上升,Correct,会持续上升
-72,1990 年，世界上有 3%的人口生活在与他们出生地不同的国家。今天这个比例是多少？,zh-CN,4%,Correct,14%,Wrong,24%,Very Wrong,4%
-1620,旅游业在全球经济总量中占多大比重？,zh-CN,10%左右,Correct,25%左右,Wrong,40%左右,Very Wrong,10%左右
-11,1990 年，世界人口的 58% 生活在低收入国家。 在今天，这个比例是多少？,zh-CN,约 9%,Correct,约 37％,Wrong,约 61％,Very Wrong,约 9%
-1603,到 2050 年，哪个地区的老年人（65 岁以上）数量将不会翻番？,zh-CN,低收入国家,Very Wrong,中等收入国家,Wrong,高收入国家,Correct,高收入国家
-1706,电子和制药公司用于研发的费用约占其收入的 12%。能源和燃料生产公司花费多少用于研发？,zh-CN,大约 0.3％,Correct,大约 3％,Wrong,大约 30％,Very Wrong,大约 0.3％
-1528,全世界有多少人的食物、水、厕所、电力、学校教育和医疗保健等基本需求得到满足？,zh-CN,20% 左右,Very Wrong,50% 左右,Wrong,80% 左右,Correct,80% 左右
-5,全球暖化带来的多余热量中有多少被海洋吸收了？,zh-CN,約百分之 10,Very Wrong,約百分之 50,Wrong,約百分之 90,Correct,約百分之 90
-76,当下，居住在城镇的人口是 1950 年的 5 倍多。农村人口发生了什么变化？,zh-CN,接近减半,Very Wrong,几乎没有变化,Wrong,接近翻倍,Correct,接近翻倍
-1520,在过去 20 年里，每年在野外捕获的鱼类总量发生了什么变化？,zh-CN,降低了大约 30%,Wrong,保持不变,Correct,上升了大约 30%,Very Wrong,保持不变
-20_text,现在，世界上大概有八十亿人。哪个选项正确展示出当今的人口的分布？,zh-CN,美洲 10 亿，欧洲 10 亿，非洲 20 亿，亚洲 40 亿,Wrong,美洲 10 亿，欧洲 10 亿，非洲 10 亿，亚洲 50 亿,Correct,美洲 20 亿，欧洲 10 亿，非洲 10 亿，亚洲 40 亿,Very Wrong,美洲 10 亿，欧洲 10 亿，非洲 10 亿，亚洲 50 亿
-1758,在全世界赚到的所有钱中，有多少来自服务业，如行政、银行、护理、教学、交通和娱乐？,zh-CN,大约 30％,Very Wrong,大约 50%,Wrong,超过 60%,Correct,超过 60%
-9,世界上使用的所有能源，有多少是来自于天然气、煤炭和石油？,zh-CN,约 42％,Very Wrong,约 62％,Wrong,约 82％,Correct,约 82％
-1771,2000 年至 2020 年间，全球孕产妇死亡率（母亲在分娩时或分娩后不久死亡的比率）发生了什么变化？,zh-CN,下降了约 30%,Correct,下降了约 10%,Wrong,大致保持不变,Very Wrong,下降了约 30%
-1770,1986 年，拥有核武器的国家储存了大约 64000 枚弹头。今天这个数字是多少？,zh-CN,少 80%,Correct,差不多,Wrong,多 50%,Very Wrong,少 80%
-1768,2010 年至 2021 年间，有多少个国家的谋杀率有所下降？(共有 195 个国家）。,zh-CN,小于 15 个,Very Wrong,大约 50 个,Wrong,超过 85 个,Correct,超过 85 个
-1767,与 30 年前相比，当今世界海洋中的微塑料污染大致为：,zh-CN,过去的 8 倍,Correct,过去的 80 倍,Wrong,过去的 800 倍,Very Wrong,过去的 8 倍
-1766,自 1960 年以来，种植马铃薯、木薯、玉米、水稻和小麦的田地的平均粮食产量发生了什么变化？,zh-CN,减少到一半以下,Very Wrong,基本保持不变,Wrong,差不多翻倍,Correct,差不多翻倍
-1765,与 2000 年相比，当前有多少动物、植物和真菌物种在野外得到了评估并被赋予保护地位？,zh-CN,多 7%,Very Wrong,多 70%,Wrong,多 700%,Correct,多 700%
-1762,联合国阻止全球变暖的方案依赖于从大气中捕捉和储存碳的技术。其中有多少技术是我们现在已经可以负担得起和使用的呢？,zh-CN,少于 1%,Correct,大约 15％,Wrong,大约 30％,Very Wrong,少于 1%
-1761,每年，太阳的能量（光和热）都会到达地球。与这些能量相比，每年有多少能量离开地球进入太空？,zh-CN,比到达的少 30%,Wrong,比到达的少 0.3%,Correct,比到达的多 30%,Very Wrong,比到达的少 0.3%
-1760,高收入国家每人每年使用的原材料与中等收入国家相比怎样？,zh-CN,少 60%,Very Wrong,差不多,Wrong,多 60%,Correct,多 60%
-1759,自 1850 年以来，欧洲和北美的碳排放量占总排放量的比例是多少？,zh-CN,大约 50%,Correct,大约 75％,Wrong,大约 90％,Very Wrong,大约 50%
-1757,1900 年，全世界约有 40%的儿童在 5 岁前死亡。今天这个数字是多少？,zh-CN,大约 4%,Correct,大约 14%,Wrong,大约 24％,Very Wrong,大约 4%
-1756,与世界上最贫穷的 50%人口相比，高收入国家的人口平均每人排放的二氧化碳量怎样？,zh-CN,差不多,Very Wrong,多 20%,Wrong,多 5 倍,Correct,多 5 倍
-1755,全世界 6 至 11 岁的女孩中，有多少人上学？,zh-CN,少于 30%,Very Wrong,大约 60%,Wrong,大约 90％,Correct,大约 90％
-1753,1950 年，有多少国家的奴隶仍然合法？（也就是没有法律禁止对他人的拥有权）,zh-CN,58,Very Wrong,98,Wrong,138,Correct,138
-1749,如果我们今天停止所有碳排放（二氧化碳），大概需要多长时间大气中的二氧化碳浓度才能恢复到 1750 年的水平？,zh-CN,少于 100 年,Very Wrong,大约 500 年,Wrong,大于 1000 年,Correct,大于 1000 年
-1747,在 2004 年全球排放的所有温室气体中，约有 1%需要缴纳某种碳税或碳费。2022 年的数字是多少？,zh-CN,大约 3％,Very Wrong,大约 13％,Wrong,大约 23％,Correct,大约 23％
-1738,从 2015 年到 2021 年，世界人口每年增加 8000 万。在同一时期，能够用上电的人口每年增加多少？,zh-CN,约 1500 万,Very Wrong,约 4000 万,Wrong,约 1.2 亿,Correct,约 1.2 亿
-1730,2021 年，在高收入国家，较贫穷的一半人口每人因燃烧化石燃料排放了大约 5 吨二氧化碳。而最富有的 10%人口每人排放了多少？,zh-CN,是较穷人口的一半,Very Wrong,是较穷人口的 2 倍,Wrong,是较穷人口的 8 倍,Correct,是较穷人口的 8 倍
-1729_text,如果我们将世界人口分为三个收入组别，低于 2 美元/天的组别，介于 2 美元/天和 24 美元/天之间的组别，以及高于 24 美元/天的组别。哪个选项最能体现 2022 年各群体的规模？,zh-CN,"50% 少于 2 美元/天, 20% 在 2 美元/天 和 24 美元/天之间, 30% 大于 24 美元/天",Very Wrong,"30% 少于 2 美元/天, 40% 在 2 美元/天 和 24 美元/天之间, 30% 大于 24 美元/天",Wrong,"10% 少于 2 美元/天, 70% 在 2 美元/天 和 24 美元/天之间, 20% 大于 24 美元/天",Correct,"10% 少于 2 美元/天, 70% 在 2 美元/天 和 24 美元/天之间, 20% 大于 24 美元/天"
-1728,如果我们今天停止排放温室气体，海平面还会继续上升多久？,zh-CN,大约 10 年,Very Wrong,大约 100 年,Wrong,超过 1000 年,Correct,超过 1000 年
-1727,在全世界赚到的所有钱中，有多少来自工业、制造业和建筑业？,zh-CN,28%,Correct,38%,Wrong,53%,Very Wrong,28%
-1725,在全球范围内，每天生活费不足 2.15 美元的人被称为生活在赤贫之中。在 40 个最富裕的国家中，国家贫困线大致是：,zh-CN,每天 5 美元,Very Wrong,每天 10 美元,Wrong,每天 20 美元,Correct,每天 20 美元
-1724,2017 年底，在全球所有被关押的记者中，被关在俄罗斯、中国、伊朗、缅甸和白俄罗斯加起来约占 20%。2022 年这个数字是多少？,zh-CN,大约 23％,Very Wrong,大约 38％,Wrong,大约 53％,Correct,大约 53％
-1723,因冲突、迫害或灾难而被迫逃离家园的人大多数迁往何处？,zh-CN,邻国,Wrong,留在自己的国家,Correct,高收入国家,Very Wrong,留在自己的国家
-1722,自 1987 年达成限制消耗臭氧层物质的协议以来，已经有多少消耗臭氧层物质被淘汰了？,zh-CN,少于 40%,Very Wrong,大约 70％,Wrong,超过 95%,Correct,超过 95%
-1720,在全世界范围内，有多少农村地区的人能够在离家 30 分钟的路程内获得安全饮用水？,zh-CN,少于 25%,Very Wrong,大约 50%,Wrong,超过 75%,Correct,超过 75%
-1719,在所有国家中，有多大比例的国家制定了某种保护妇女免受家庭暴力的法律？,zh-CN,大约 25％,Very Wrong,大约 50%,Wrong,大约 75％,Correct,大约 75％
-1717,人均收入与印度、摩洛哥和玻利维亚相近的国家，人均二氧化碳排放量和高收入国家相比怎样？,zh-CN,少 80%,Correct,少 40%,Wrong,多 20%,Very Wrong,少 80%
-1716,在这些自由贸易区中，哪个包括的国家最多？,zh-CN,北美自由贸易协议,Very Wrong,非洲大陆自由贸易区,Correct,欧盟,Wrong,非洲大陆自由贸易区
-1711,全世界大学教师中女性占多大比例？,zh-CN,大约 20%,Very Wrong,大约 30%,Wrong,大约 40%,Correct,大约 40%
-1710,从家庭、企业和街道收集的废物中，食物和其他绿色废物（如割下来的草）所占的比例是多少？,zh-CN,大约 45％,Correct,大约 55%,Wrong,大约 65％,Very Wrong,大约 45％
-1709,2022 年全球生活成本最高的十个城市有多少个在欧洲？,zh-CN,4,Correct,6,Wrong,9,Very Wrong,4
-1708,在 2012 年和 2022 年之间，被恐怖主义者杀害而死亡的人数发生了什么变化？,zh-CN,减少 25%,Correct,差不多,Wrong,增加 25%,Very Wrong,减少 25%
-1705,世界各地的慈善机构和政府都会向最不发达国家提供资金。比较两者的捐赠总额：,zh-CN,慈善机构的捐赠多出 20 倍,Very Wrong,政府的捐赠多出 20 倍,Correct,两者的捐赠数量大致相同,Wrong,政府的捐赠多出 20 倍
-1703,在 194 个国家中，有多少个国家参加了世界卫生组织的某些儿童疫苗接种计划？,zh-CN,64,Very Wrong,114,Wrong,194,Correct,194
-1702,全世界有多少凶杀案的受害者是男性？,zh-CN,大约 40％,Very Wrong,大约 60％,Wrong,大约 80％,Correct,大约 80％
-1692,与 100 年前相比，今天美国的电费有多便宜？,zh-CN,差不多,Very Wrong,便宜 10 倍,Wrong,便宜 200 倍,Correct,便宜 200 倍
-1690,移民人口在欧洲人口中占多大比例？,zh-CN,大约 12%,Correct,大约 22％,Wrong,大约 32％,Very Wrong,大约 12%
-1689,在 2000 年全球使用的所有能源中，约 86% 来自燃烧化石燃料（石油、煤炭和天然气）。今天这个数字是多少？,zh-CN,大约 52％,Very Wrong,大约 64％,Wrong,大约 82％,Correct,大约 82％
-1688,2020 年全球使用毒品（酒精或药物除外）的成年人比例是多少？,zh-CN,大约 5％,Correct,大约 25%,Wrong,大约 50%,Very Wrong,大约 5％
-1687,美国人口占世界人口的 4%。在世界银行决定向贫穷国家提供发展贷款时，美国拥有多少投票权？,zh-CN,16%,Correct,36%,Wrong,56%,Very Wrong,16%
-1686,政府有时会向外国公司出售土地。自 2009 年以来，全世界每年报告出售的土地总公顷数有什么变化？,zh-CN,减少到一半以下,Correct,大致保持不变,Wrong,加倍,Very Wrong,减少到一半以下
-1685,在全球范围内，哪个地区的移动支付账户数量最多？,zh-CN,东亚和太平洋地区,Wrong,撒哈拉以南非洲,Correct,欧洲和中亚,Very Wrong,撒哈拉以南非洲
-1683,在全球所有道路死亡事故中，发生在中低收入国家的占多大比例？,zh-CN,大约 50%,Very Wrong,大约 70％,Wrong,大约 90％,Correct,大约 90％
-1682,自 1970 年以来，全球人均能源消耗量发生了什么变化？,zh-CN,基本保持不变,Wrong,上升了大约 50%,Correct,上升了大约 250%,Very Wrong,上升了大约 50%
-1681,在低收入国家，小型家庭农场的平均收入是多少？,zh-CN,大约每天 1.50 美元,Correct,大约每天 5 美元,Wrong,大约每天 10 美元,Very Wrong,大约每天 1.50 美元
-1671,联合国专家预计，未来 30 年，全球城市人口将增至 67 亿。他们认为农村人口会在这个时期里发生什么变化？,zh-CN,下降大约 10%,Correct,基本保持不变,Wrong,增长大约 10%,Very Wrong,下降大约 10%
-1670,2022 年，全球约有 10%的人口生活在低收入国家。全球难民人口中有多大比例生活在那些国家？,zh-CN,少于 20%,Correct,35%左右,Wrong,超过 50%,Very Wrong,少于 20%
-1669,在 195 个国家中，有多少国家禁止使用含铅汽油？,zh-CN,75 个国家,Very Wrong,135 个国家,Wrong,195 个国家,Correct,195 个国家
-1663,2019 年在欧盟地区就业的受过大学教育的难民中，有多少人的受教育程度比职业要求的高？,zh-CN,大约 20％,Very Wrong,大约 40％,Wrong,大约 60％,Correct,大约 60％
-1662,2018 年，有多大比例的难民收容国允许难民享有工作权？,zh-CN,大约 10％,Very Wrong,大约 30％,Wrong,大约 50%,Correct,大约 50%
-1661,目前有多少难民生活在世界上最大的难民营中？,zh-CN,"大约 9,000",Very Wrong,"大约 90,000",Wrong,"大约 900,000",Correct,"大约 900,000"
-1660,在全球最富有的 250 家公司中，有多少家已经制定了减少碳排放的目标？,zh-CN,大约 20%,Very Wrong,大约 50%,Wrong,大约 80%,Correct,大约 80%
-1568,在全世界所有枪支中，平民拥有的比例是多少？,zh-CN,大约 25%,Very Wrong,大约 55%,Wrong,大约 85%,Correct,大约 85%
-1567,联合国专家预计，从现在到 2050 年，世界 195 个国家中有多少国家的人口会减少？,zh-CN,10 个国家左右,Very Wrong,30 个国家左右,Wrong,50 个国家左右,Correct,50 个国家左右
-1562,全世界有多少人口可以使用与下水道相连的厕所？,zh-CN,大约 40％,Correct,大约 55%,Wrong,大约 70％,Very Wrong,大约 40％
-1560,世界上有多少婴儿是在医疗机构出生的？,zh-CN,大约 40%,Very Wrong,大约 60%,Wrong,大约 80％,Correct,大约 80％
-1556,在全球范围内，有多少生活在农村地区的人能够用上一些电？,zh-CN,少于 30%,Very Wrong,大约 50%,Wrong,超过 70%,Correct,超过 70%
-1553,在所有 25-29 岁的男性中，约有 90% 处于劳动力大军中（有工作或正在找工作）。女性的这一数字是多少？,zh-CN,大约 60％,Correct,大约 70％,Wrong,大约 80％,Very Wrong,大约 60％
-1509,世界上有多少成年人说自己识字？,zh-CN,小于 40%,Very Wrong,大约 60%,Wrong,超过 80%,Correct,超过 80%
-113,今世界有多大比例的人口生活在预期寿命短于 50 岁的国家（1960 年约为 55%）？,zh-CN,少于 1%,Correct,大约 30%,Wrong,大约 60%,Very Wrong,少于 1%
-21,1980 年，世界上大约 40%的人口生活在极端贫困中，每天的生活费不足 2 美元。今天这个比例是多少？,zh-CN,10%,Correct,30%,Wrong,50%,Very Wrong,10%
-37,全世界有多少 15 岁以下的儿童的阅读和数学能力没有达到最低技能要求？,zh-CN,20%,Very Wrong,40%,Wrong,60%,Correct,60%
-1500,在全球范围内，女性从事的无偿护理和家务劳动多于男性。多多少？,zh-CN,多 50%,Very Wrong,一倍左右,Wrong,三倍左右,Correct,三倍左右
-97,哪类动物的濒危物种比例最高？,zh-CN,鸟类,Wrong,哺乳动物,Very Wrong,两栖动物,Correct,两栖动物
-1499,20 世纪 50 年代，50% 的战争发生在被联合国承认为主权国家的国家之间。今天这个数字是多少？,zh-CN,5% 左右,Correct,25% 左右,Wrong,55% 左右,Very Wrong,5% 左右
-1502,联合国可持续发展目标共有 169 项具体目标。其中有多少项提到了“民主”？,zh-CN,0 项目标,Correct,14 项目标,Wrong,34 项目标,Very Wrong,0 项目标
-64,2022 年全球使用互联网的人口比例是多少？,zh-CN,大约 60%,Correct,大约 75%,Wrong,大约 95%,Very Wrong,大约 60%
-1517,现在的非洲儿童死亡人数与以下哪年的欧洲儿童死亡人数持平：,zh-CN,1850,Very Wrong,1900,Wrong,1950,Correct,1950
-13,世界上有多少人口居住在特大城市（至少有 1000 万人口的城市）？,zh-CN,约 8％,Correct,约 28%,Wrong,约 48%,Very Wrong,约 8％
-105,中收入和低收入国家在联合国大会成员国中占多大比例？,zh-CN,20% 左右,Very Wrong,45% 左右,Wrong,70% 左右,Correct,70% 左右
-50,世界上有多少人能用上电？,zh-CN,小于 20%,Very Wrong,大约 50%,Wrong,大于 80%,Correct,大于 80%
-12,世界上有多少人口没有足够的食物来满足他们的日常需要？,zh-CN,约 11％,Correct,约 23％,Wrong,约 37％,Very Wrong,约 11％
-82,全球 250 家最富有的公司中，有多少家公司在其年度报告中将气候变化描述为一项风险？,zh-CN,约 10%,Very Wrong,约 30%,Wrong,约 60%,Correct,约 60%
-81,从发现消耗臭氧层的物质到签署禁止使用这些物质的国际协定用了多少年？,zh-CN,14 年,Correct,24 年,Wrong,34 年,Very Wrong,14 年
-71,如果以最富有的 10% 的人在所有收入中所占比例来衡量，世界上哪个地区的收入不平等程度最小？,zh-CN,中东,Very Wrong,欧洲,Correct,撒哈拉以南非洲,Wrong,欧洲
-55,与 1980 年相比，现今太阳能电池板的能源成本大约是：,zh-CN,1980 年成本的 1%,Correct,1980 年成本的 21%,Wrong,1980 年成本的 41%,Very Wrong,1980 年成本的 1%
-52,1990 年，世界上有 19% 的 65 岁及以上老年人仍在工作。预计 2030 年这一比例会是多少？,zh-CN,大约 21%,Correct,大约 41%,Wrong,大约 61%,Very Wrong,大约 21%
-46,在世界范围内，有多少人没有任何形式的厕所，而不得不使用灌木丛、田野或街道？,zh-CN,大约 10%,Correct,大约 30%,Wrong,大约 50%,Very Wrong,大约 10%
-33,2021 年全球有多少 1 岁儿童接种了某种疾病的疫苗？,zh-CN,20% 以下,Very Wrong,约 50%,Wrong,80% 以上,Correct,80% 以上
-31,全世界有多少婴儿是在接受过训练的保健工作者在场的情况下出生的？,zh-CN,小于 30%,Very Wrong,大约 50%,Wrong,大于 80%,Correct,大于 80%
-27,全世界有多少个国家在基因库中的保存了植物遗传材料？,zh-CN,小于 10 个,Very Wrong,大约 50 个,Wrong,大约 100 个,Correct,大约 100 个
-1764,自 2005 年以来，欧盟和美国的二氧化碳排放量发生了什么变化？,zh-CN,都下降了 20%,Correct,都大致保持不变,Wrong,都增加了 20%,Very Wrong,都下降了 20%
-1763,在全世界 195 个国家中，有多少国家的妇女没有选举权？,zh-CN,0,Correct,23,Wrong,46,Very Wrong,0
-1748,在撒哈拉以南非洲地区，有多少女孩在 15 岁之前结婚？,zh-CN,大约 10％,Correct,大约 30％,Wrong,大约 50%,Very Wrong,大约 10％
-1726,从 2001 年到今天，有多少国家从低收入国家晋升为中等收入国家？,zh-CN,小于 5,Very Wrong,大约 15,Wrong,大于 30,Correct,大于 30
-1715,温室气体有什么作用？,zh-CN,污染大气,Very Wrong,吸收热量,Correct,反射光和热量,Wrong,吸收热量
-1712,在 2023 年 3 月各国签署《公海条约》之前，约有 1%的国际海洋受到保护。新协议的目标是达到多少？,zh-CN,大约 5％,Very Wrong,大约 15％,Wrong,大约 30％,Correct,大约 30％
-1707,如果目前的趋势继续下去，预计到 2100 年，世界上最大的城市将会在：,zh-CN,亚洲,Wrong,非洲,Correct,美洲,Very Wrong,非洲
-17,世界上有多少国家制定了禁止工作场所性骚扰的法律？,zh-CN,大约 30%,Very Wrong,大约 50%,Wrong,大约 70%,Correct,大约 70%
-1680,有多少个国家拥有环绕地球轨道的卫星？,zh-CN,约 15 个国家,Very Wrong,大约 40 个国家,Wrong,约 80 个国家,Correct,约 80 个国家
-1653,在全球范围内，有多少工人从事未在政府登记的非正规工作（农业除外）？,zh-CN,约 20%,Very Wrong,约 40%,Wrong,约 60%,Correct,约 60%
-1600,在全世界所有记录在案的凶杀案中，有多大比例与帮派和有组织犯罪有关？,zh-CN,大约 20%,Correct,大约 40%,Wrong,大约 60%,Very Wrong,大约 20%
-1594,2022 年末，世界人口达到约 80 亿。有多少人会因为人道主义危机变成国际难民、寻求庇护者或者流落到其他国家和地区？,zh-CN,4600 万(0.6%),Correct,4.8 亿(6%),Wrong,7.2 亿(9%),Very Wrong,4600 万(0.6%)
-1512,在全球范围内，过去 50 年里用于农业的土地面积发生了什么变化？,zh-CN,增加了 50%,Wrong,大致保持不变,Correct,减少了 50%,Very Wrong,大致保持不变
-1511,非洲人吃的食物有多少产自非洲？,zh-CN,少于 20%,Very Wrong,约 50%,Wrong,超过 80%,Correct,超过 80%
-98,如果计算世界上所有哺乳动物的总重量，大约有多少来自野生哺乳动物？,zh-CN,大约 5%,Correct,大约 25%,Wrong,大约 50%,Very Wrong,大约 5%
-95,1990 年，全世界约有 9% 的儿童在五岁前死亡。现今这个数字是多少？,zh-CN,大约 4%,Correct,大约 9%,Wrong,大约 14%,Very Wrong,大约 4%
-93,在 195 个国家中，有多少国家批准了 1982 年出台的联合国公约《海洋法》？,zh-CN,45 个国家,Very Wrong,105 个国家,Wrong,165 个国家,Correct,165 个国家
-89,自 20 世纪 70 年代以来，全球每年油轮漏油的次数发生了什么变化？,zh-CN,减少到十分之一,Correct,几乎保持不变,Wrong,增加到 10 倍,Very Wrong,减少到十分之一
-43,在 195 个国家中，有多少个国家未满 18 岁可以合法结婚？,zh-CN,19,Very Wrong,79,Wrong,139,Correct,139
-3,在世界上的低收入国家里，有百分之多少的女孩（在疫情前）上学到至少十一岁？,zh-CN,大约 20％,Very Wrong,大约 40％,Wrong,大约 60％,Correct,大约 60％
-24,世界上有多少人使用不产生烟雾的炉灶做饭？,zh-CN,大约 20%,Very Wrong,大约 40%,Wrong,大约 60%,Correct,大约 60%
-2,在过去的一百年里，每年死于自然灾害的人数有什么变化？,zh-CN,增加一倍以上,Very Wrong,基本没有变化,Wrong,减少到一半以下,Correct,减少到一半以下
-18,世界经济中有多少来自农业、林业和渔业？,zh-CN,大约 5%,Correct,大约 25%,Wrong,大约 45%,Very Wrong,大约 5%
-1750,2000 年，全球约 50%的人口没有现代炉灶。他们只能靠烧木柴、木炭或粪便来烹饪食物。如今这一比例是多少？,zh-CN,大约 30％,Correct,大约 40%,Wrong,大约 50%,Very Wrong,大约 30％
-1650,有多少国家的最高政治领导人是女性？,zh-CN,4 个左右,Very Wrong,12 个左右,Wrong,19 个左右,Correct,19 个左右
-1611,当今世界有多少人口生活在中等收入国家？,zh-CN,25%左右,Very Wrong,50%左右,Wrong,75%左右,Correct,75%左右
-1510,全世界 6 至 11 岁的儿童中，有多少人上学？,zh-CN,小于 25%,Very Wrong,大约 60%,Wrong,超过 85%,Correct,超过 85%
-1504,世界人口中有多少生活在平均每个女性生育不到 3 个孩子的国家？,zh-CN,40%,Very Wrong,60%,Wrong,80%,Correct,80%
-118,在过去 20 年中，生活在极端贫困中的人口比例...,zh-CN,减少一半以上,Correct,保持差不多不变,Wrong,接近翻倍,Very Wrong,减少一半以上
-109,2000 年，中收入和低收入国家的贷款支出平均占其年收入的近 6%。2020 年这一数字是多少？,zh-CN,大约 4%,Correct,大约 8%,Wrong,大约 12%,Very Wrong,大约 4%
-102,过去 10 年全球海盗袭击船只的次数发生了什么变化？,zh-CN,翻倍了,Very Wrong,没有大的变化,Wrong,减半了,Correct,减半了
-62,全球所有工人中制造业工人的占比是多少？,zh-CN,约 15％,Correct,约 35％,Wrong,约 55％,Very Wrong,约 15％
-70,在计算世界上的难民人数时，这个数字包括了那些……,zh-CN,逃往其他国家的人,Correct,逃往祖国其他地方的人,Very Wrong,以上两者都计算在内,Wrong,逃往其他国家的人
-69,2022 年 12 月，在全球所有难民中，流亡时间超过四年的难民占多大比例？,zh-CN,小于 30%,Very Wrong,大约 45%,Wrong,超过 60%,Correct,超过 60%
-66,世界上所有移民中，难民占多大比例？,zh-CN,大约 15%,Correct,大约 25%,Wrong,大约 35%,Very Wrong,大约 15%
-65,在这些国家中，哪个国家收容的难民占其人口的比例最大？,zh-CN,德国,Wrong,黎巴嫩,Correct,瑞典,Very Wrong,黎巴嫩
-80,自 1990 年以来，欧洲约有 200 万人因自然灾害而暂时无家可归。同期亚洲的数字是……,zh-CN,与欧洲大致相同,Very Wrong,欧洲的 5 倍,Wrong,欧洲的 50 倍,Correct,欧洲的 50 倍
-1524,与 2000 年相比，2022 年全世界使用了多少化石燃料（石油、煤炭和天然气）？,zh-CN,95%（减少了一点）,Very Wrong,120%（增加了一点）,Wrong,145%（增加很多）,Correct,145%（增加很多）
-86,在联合国承认的 195 个国家中，有多少国家采纳了关于气候变化和环境的《巴黎协定》？,zh-CN,92 个,Very Wrong,142 个,Wrong,192 个,Correct,192 个
-1672,1990 年，全球约有 60% 的电力是用化石燃料生产的。今天这个数字是多少？,zh-CN,40%左右,Very Wrong,50%左右,Wrong,60%左右,Correct,60%左右
-23,在高收入国家的所有 65 岁或以上人口中，有多少人生活在贫困线以下？,zh-CN,14%,Correct,29%,Wrong,44%,Very Wrong,14%
-57,全世界有多少超过退休年龄的人有权领取养老金？,zh-CN,大约 25%,Very Wrong,大约 45%,Wrong,大约 75%,Correct,大约 75%
-1678,2022 年，许多西欧国家政府为帮助新抵达的难民，为每个人每天花费超过 20 美元。 联合国在世界其他地方为每个难民平均每天花费多少钱？,zh-CN,大约 1 美元,Correct,大约 25 美元,Wrong,大约 45 美元,Very Wrong,大约 1 美元
-96,2000 年至 2017 年间，全球新增结核病例数发生了什么变化？,zh-CN,增加了 20%,Very Wrong,几乎保持不变,Wrong,降低了 20%,Correct,降低了 20%
-1640,在全球范围内，每天收入低于 2 美元被视为赤贫。北欧国家（瑞典、挪威、丹麦、芬兰、冰岛）的国家贫困线大致如下：,zh-CN,每天 10 美元,Very Wrong,每天 20 美元,Wrong,每天 30 美元,Correct,每天 30 美元
-1676,2022 年，联合国花费了大约 64 亿美元来帮助全世界的难民。西欧各国政府为帮助西欧境内的难民花费了多少钱？,zh-CN,不到 30 亿美元,Very Wrong,大约 50 亿美元,Wrong,超过 200 亿美元,Correct,超过 200 亿美元
-51,1950 年，0% 的能源消耗来自核能。今天这个数字是多少？,zh-CN,大约 5%,Correct,大约 25%,Wrong,大约 45%,Very Wrong,大约 5%
-8,生物学家已经评估了超过 15 万种动植物的状况。有多少属于濒危或受威胁物种？,zh-CN,大约 30％,Correct,大约 60％,Wrong,大约 90％,Very Wrong,大约 30％
-120,今年全世界预计会有多少天花病例？,zh-CN,零病例,Correct,十万病例,Wrong,一百万病例,Very Wrong,零病例
-103,世界上有多少人说他们对当地警察有信心？,zh-CN,小于 15%,Very Wrong,25% 左右,Wrong,超过 50%,Correct,超过 50%
-104,在过去 5 年中，哪一组国家派出的联合国维和部队最多？,zh-CN,德国，瑞典，荷兰，爱尔兰,Wrong,埃塞俄比亚，卢旺达，孟加拉国，印度，尼泊尔,Correct,法国，美国，日本，韩国，瑞士，英国,Very Wrong,埃塞俄比亚，卢旺达，孟加拉国，印度，尼泊尔
-106,世界上有多少人在自己居住的地方独自走夜路会感到安全？,zh-CN,小于 30%,Very Wrong,大约 45%,Wrong,超过 60%,Correct,超过 60%
-42,截至 1990 年，世界上有 18 个国家由女性国家元首或政府首脑领导。今天这个数字是多少？,zh-CN,36,Very Wrong,56,Wrong,86,Correct,86
-41,在世界范围内，妇女使用最多的避孕方法是哪一种？,zh-CN,避孕药,Very Wrong,绝育,Correct,宫内节育器（IUD）,Wrong,绝育
-91,全球每年平均食用 6 公斤牛肉和小牛肉。平均每人消费多少鱼肉？,zh-CN,大约 3kg,Very Wrong,大约 6kg,Wrong,大约 10kg,Correct,大约 10kg
-92,自 2016 年以来，世界各国水域中海洋保护区的比例发生了什么变化？,zh-CN,减少了 75%,Very Wrong,几乎保持不变,Wrong,增加了 75%,Correct,增加了 75%
-1633,70% 的欧洲人表示，他们计划在 2020 年改用更环保的能源供应商，以应对气候变化。有多少中国人计划这样做？,zh-CN,34%,Very Wrong,64%,Wrong,94%,Correct,94%
-1580,在全世界所有儿童（5-17 岁）中，有多少人是受剥削的童工？,zh-CN,大约 10%,Correct,大约 30%,Wrong,大约 50%,Very Wrong,大约 10%
-1523,在印度、中国、巴基斯坦、越南、柬埔寨、缅甸和印度尼西亚这些拥有大量纺织厂的国家，最低工资大致是多少？,zh-CN,大约一天 0.2 美元,Very Wrong,大约一天 1 美元,Wrong,大约一天 10 美元,Correct,大约一天 10 美元
-1646,低收入国家的所有收入中有多少来自农业、林业和渔业？,zh-CN,25%左右,Correct,50%左右,Wrong,75%左右,Very Wrong,25%左右
-44,在 195 个国家中，有多少国家签署了联合国《消除对妇女一切形式歧视公约》？,zh-CN,50,Very Wrong,120,Wrong,190,Correct,190
-1574,在 195 个国家中，有多少个国家的已婚男性和女性不平等地享有拥有土地或房屋的法律权利？,zh-CN,约 20 个国家,Correct,约 80 个国家,Wrong,约 140 个国家,Very Wrong,约 20 个国家
-1619,在撒哈拉以南非洲地区，有多少女性在 18 岁之前结婚？,zh-CN,30%左右,Correct,50%左右,Wrong,70%左右,Very Wrong,30%左右
-1652,有多少国家的法律规定男女同工同酬？,zh-CN,10%左右,Very Wrong,25%左右,Wrong,40%左右,Correct,40%左右
-45,全世界有多少生活在农村地区的人使用地表水（如湖泊、河流和溪流）作为饮用水？,zh-CN,小于 10%,Correct,大约 30%,Wrong,大于 60%,Very Wrong,小于 10%
-47,在全世界使用的所有淡水中，有多少用于农业？,zh-CN,大约 30%,Very Wrong,大约 50%,Wrong,大约 70%,Correct,大约 70%
-48,在 195 个国家中，有多少个国家至少有一座海水淡化厂？,zh-CN,30,Very Wrong,100,Wrong,180,Correct,180
-49,有多少国家制定了要求在规划和管理淡水资源时需要当地社区参与的规则？,zh-CN,30%,Very Wrong,50%,Wrong,70%,Correct,70%
-1576,在世界范围内，有多少生活在农村地区的人拥有厕所并且不用与其他家庭共用？,zh-CN,大约 20%,Very Wrong,大约 40%,Wrong,大约 60%,Correct,大约 60%
-1577,世界上有多少人在家里有肥皂和水洗手？,zh-CN,大约 20%,Very Wrong,大约 40%,Wrong,大约 70%,Correct,大约 70%
-1626,全世界有多少人需要花费超过 30 分钟的往返时间来获取饮用水？,zh-CN,10%左右,Correct,25%左右,Wrong,50%左右,Very Wrong,10%左右
-54,在当今世界使用的所有可再生能源中，传统的生物质燃烧（如木炭、木材和农业废物）占据了多少比例？,zh-CN,大约 10%,Very Wrong,大约 25%,Wrong,大约 40%,Correct,大约 40%
-1637,全世界使用的所有能源中有多少来自现代可再生能源（太阳能、风能、水能和现代生物燃料）？,zh-CN,小于 10%,Correct,20%左右,Wrong,35%左右,Very Wrong,小于 10%
-1638,在全球电力总成本中，从发电站到用户的平均传输成本是多少？,zh-CN,小于 1%,Very Wrong,Around 15%,Wrong,30%左右,Correct,30%左右
-53,全球约有 1.6 亿儿童被雇为童工。他们大多在哪个部门工作？,zh-CN,工业,Very Wrong,农业,Correct,服务业,Wrong,农业
-1522,自 1970 年以来，40 个最富有国家的平均收入增长了一倍多。其他国家的平均收入怎样？,zh-CN,减少到一半,Very Wrong,大致保持不变,Wrong,增加 1 倍多,Correct,增加 1 倍多
-1578,在全世界范围内，非正规就业的年轻人（15-24 岁）占所有在工作的年轻人的比例是多少？,zh-CN,大约 25%,Very Wrong,大约 50%,Wrong,大约 75%,Correct,大约 75%
-1587,世界上有多少男青年（15-24 岁）没有上学、就业或培训？,zh-CN,大约 15%,Correct,大约 25%,Wrong,大约 35%,Very Wrong,大约 15%
-1643,低收入国家有多少人拥有某种形式的银行账户？,zh-CN,15%左右,Very Wrong,25%左右,Wrong,40%左右,Correct,40%左右
-1645,高收入国家平均约 25% 的经济产出来自工业部门（包括制造业和建筑业）。低收入国家的这一比例是多少？,zh-CN,大约 25%,Correct,大约 45%,Wrong,大约 65%,Very Wrong,大约 25%
-61,2020 年，高收入国家的港口处理了约 3.5 亿个集装箱。中收入和低收入国家的这一数字是多少？,zh-CN,不到 2 亿,Very Wrong,大约 3 亿,Wrong,超过 4 亿,Correct,超过 4 亿
-1581,2021 年，全球货轮运载了 110 亿吨货物。其中石油、石油产品、天然气和化学品占多大比例？,zh-CN,大约 30%,Correct,大约 40%,Wrong,大约 50%,Very Wrong,大约 30%
-1625,2020 年，制造业对世界经济的贡献有多大？,zh-CN,15%左右,Correct,30%左右,Wrong,45%左右,Very Wrong,15%左右
-73,在 195 个国家中，有多少国家承诺根据联合国《消除一切形式种族歧视公约》制定反对种族主义的法律？,zh-CN,10 个国家,Very Wrong,80 个国家,Wrong,180 个国家,Correct,180 个国家
-74,国际货币基金组织（IMF）成员中，中收入和低收入国家占多大比例？,zh-CN,大约 20%,Very Wrong,大约 45%,Wrong,大约 70%,Correct,大约 70%
-75,国际货币基金组织（IMF）自成立以来，已经有 12 位总裁。其中有多少人出生在欧洲？,zh-CN,4,Very Wrong,8,Wrong,12,Correct,12
-77,在世界陆地总面积中，有多少面积建有房屋或道路等有形基础设施（不包括农田）？,zh-CN,低于 5%,Correct,大约 15%,Wrong,超过 25%,Very Wrong,低于 5%
-78,2020 年，非洲约有 68 座城市的居民人数超过 100 万。根据联合国专家的预测，到 2030 年，非洲将有多少这样规模的城市？,zh-CN,小于 60 个,Very Wrong,大约 70 个,Wrong,超过 90 个,Correct,超过 90 个
-79,特大城市是指居民超过 1000 万的城市。预计到 2030 年，居住在特大城市的人口将占世界人口的多大比例？,zh-CN,大约 9%,Correct,大约 39%,Wrong,大约 69%,Very Wrong,大约 9%
-1582,全世界有多少特大城市（都市区人口超过 1 000 万）？,zh-CN,约 12 个,Very Wrong,约 23 个,Wrong,约 35 个,Correct,约 35 个
-83,在全球范围内，粮食从收获到商店销售之间损失了多少？,zh-CN,大约 15%,Correct,大约 30%,Wrong,大约 45%,Very Wrong,大约 15%
-84,在全球从家庭、企业和学校收集的所有垃圾中，塑料占总重量的多少？,zh-CN,大约 12%,Correct,大约 42%,Wrong,大约 72%,Very Wrong,大约 12%
-1585,北美产生的垃圾占世界垃圾总量的多少？,zh-CN,大约 14%,Correct,大约 28%,Wrong,大约 42%,Very Wrong,大约 14%
-1586,与高收入国家的人相比，生活在中等收入国家的人每人每天产生多少废物？,zh-CN,双倍数量的废物,Very Wrong,差不多,Wrong,一半数量的废物,Correct,一半数量的废物
-1628,在全球范围内，从家庭、企业和街道收集的垃圾中，有多大比例最终被送往有管理的垃圾填埋场（非露天垃圾场）？,zh-CN,35%左右,Correct,70%左右,Wrong,95%左右,Very Wrong,35%左右
-16,除水之外，世界上使用量最大的原材料是什么？,zh-CN,石油,Wrong,沙子,Correct,木材,Very Wrong,沙子
-7,世界上有多少人生活在海拔 5 米或更低的地区？,zh-CN,大约 11%,Correct,大约 31%,Wrong,大约 51%,Very Wrong,大约 11%
-1590,水蒸气如何加剧温室效应（使地球大气层保持温暖）？,zh-CN,水蒸气不产生影响,Wrong,水蒸气的作用很大,Correct,水蒸气不是温室气体,Very Wrong,水蒸气的作用很大
-1592,从全球来看，过去 100 年中平均海平面每年上升多少？,zh-CN,每年 2 毫米（0.08 英寸）,Correct,每年 20 毫米（0.8 英寸）,Wrong,每年 200 毫米（8 英寸）,Very Wrong,每年 2 毫米（0.08 英寸）
-1623,目前，地球的平均气温为 15℃。如果没有温室气体，地球的平均温度会是多少？,zh-CN,-18 度,Correct,+6 度,Wrong,+21 度,Very Wrong,-18 度
-1634,2020 年，70% 的欧洲人表示他们计划改用更环保的能源供应商，以应对气候变化。有多少美国人计划这样做？,zh-CN,24%,Very Wrong,44%,Wrong,64%,Correct,64%
-1639,以下哪种气体不是温室气体？,zh-CN,水蒸气,Wrong,氮气,Correct,臭氧,Very Wrong,氮气
-1649,科学家是什么时候开始意识到人类活动能够影响气候的？,zh-CN,1900 年前后,Correct,1950 年前后,Wrong,1980 年前后,Very Wrong,1900 年前后
-90,石油和天然气是 2010 年赚钱最多的海洋行业。如果趋势一如既往，到 2030 年赚钱最多的将会是哪个行业？,zh-CN,石油和天然气,Wrong,旅游业,Correct,波浪能,Very Wrong,旅游业
-1497,全世界用于食用的鱼类中有多少来自养鱼业？,zh-CN,大约 10%,Very Wrong,大约 35%,Wrong,大约 55%,Correct,大约 55%
-1521,当生物学家在 1950 年开始统计海洋中的鱼类数量时，大约 1%的鱼类资源被过度开发。到 2019 年，这个比例是…,zh-CN,…5%左右,Very Wrong,…20%左右,Wrong,…超过 35%,Correct,…超过 35%
-99,全世界用于饲养动物的农业用地占总农业用地的多大比例？,zh-CN,大约 25%,Very Wrong,大约 50%,Wrong,大约 80%,Correct,大约 80%
-100,过去 30 年间，全球森林面积发生了什么变化？,zh-CN,减少 50% 左右,Very Wrong,减少 30% 左右,Wrong,减少 10% 左右,Correct,减少 10% 左右
-101,全世界 100 年前的热带雨林，到现在仍是热带雨林的面积有多少？,zh-CN,小于 10%,Very Wrong,20% 左右,Wrong,大于 30%,Correct,大于 30%
-1529,在过去的 30 年里，全球被宣布为受保护的土地面积发生了什么变化？,zh-CN,减少了 58%,Very Wrong,减少了 8%,Wrong,增加了 8%,Correct,增加了 8%
-1583,有多少动植物物种在过去 200 年里被生物学家确认灭绝了？,zh-CN,约 600 个,Correct,"约 60,000 个",Wrong,"约 600,000 个",Very Wrong,约 600 个
-1596,亚马逊森林产生的氧气中，有多少是我们人类呼吸的？?,zh-CN,小于 1%,Correct,30%,Wrong,60%,Very Wrong,小于 1%
-1597,在一年产生的所有氧气中，有多少来自树木？,zh-CN,大约 25%,Correct,大约 55%,Wrong,大约 85%,Very Wrong,大约 25%
-1598,在全世界已知的鸟类物种中，有多少属于濒危或受威胁物种？,zh-CN,大约 15%,Correct,大约 35%,Wrong,大约 55%,Very Wrong,大约 15%
-1647,2001 年至 2018 年间，全球野火烧毁的总面积中，非洲所占的比例是多少？,zh-CN,10%左右,Very Wrong,40%左右,Wrong,70%左右,Correct,70%左右
-107,2000 年，21%的国家告诉联合国它们有独立的人权机构。今天的比例是多少？,zh-CN,23% 的国家,Very Wrong,41% 的国家,Wrong,62% 的国家,Correct,62% 的国家
-1547,在过去 10 年中，在非洲的战死的人数占全球所有战死人数的比例是多少？,zh-CN,大约 15%,Correct,大约 35%,Wrong,大约 55%,Very Wrong,大约 15%
-1599,世界人口中每年被警方或刑事司法系统怀疑、逮捕或警告的比例是多少？,zh-CN,大约 2%,Correct,大约 20%,Wrong,大约 40%,Very Wrong,大约 2%
-1610,在全世界，妇女和女童在国际移民中所占比例是多少？,zh-CN,大约 18%,Very Wrong,大约 33%,Wrong,大约 48%,Correct,大约 48%
-1621,大多数人表示信任他人的 15 个国家都是：,zh-CN,低收入国家,Very Wrong,中等收入国家,Wrong,高收入国家,Correct,高收入国家
-1629,在过去 5 年中，每年大约有 35 个国家发生某种形式的暴力冲突。20 世纪 50 年代的平均数字是多少？,zh-CN,15 个国家左右,Correct,30 个国家左右,Wrong,60 个国家左右,Very Wrong,15 个国家左右
-1635,2002 年《开放天空条约》（允许成员国在对方领土上空进行观测飞行）开始生效时，有 34 个国家加入。今天有多少个国家？,zh-CN,32,Correct,42,Wrong,62,Very Wrong,32
-1618,从 2011 年到 2022 年，美国和俄罗斯相互交流了多少次核武器信息？,zh-CN,0 次,Very Wrong,25 次,Wrong,2.5 万次,Correct,2.5 万次
-19,高收入国家政府有多少收入来自关税和进口税？,zh-CN,大约 2%,Correct,大约 12%,Wrong,大约 22%,Very Wrong,大约 2%
-112,世界上有多少最不发达国家设有外国投资促进机构？,zh-CN,20%,Very Wrong,50%,Wrong,80%,Correct,80%
-111,在最不发达国家获得的所有财政援助中，有多少来自慈善和公益事业？,zh-CN,大约 5%,Correct,大约 25%,Wrong,大约 50%,Very Wrong,大约 5%
-110,在全世界所有教育费用中，有多少是由政府支付的？,zh-CN,大约 40%,Very Wrong,大约 60%,Wrong,大约 80%,Correct,大约 80%
-108,在过去 40 年里，高收入国家对最富有的 0.1%人口征收的最高边际所得税发生了什么变化？,zh-CN,削减了三分之一,Correct,大致保持不变,Wrong,增加了三分之一,Very Wrong,削减了三分之一
-1624,与 2022 年从富有的政府提供的国际援助总额相比，全球移民向原籍国转移了多少钱？,zh-CN,不到援助总额的 10%,Very Wrong,援助总额的 30%左右,Wrong,超过援助总额的 200%,Correct,超过援助总额的 200%
-1642,与德国和美国等高收入国家相比，联合国儿童基金会在 2018 年购买儿童疫苗时，平均成本价怎样？,zh-CN,两倍以上的价格,Very Wrong,大约一半的价格,Wrong,低于 20%的价格,Correct,低于 20%的价格
-1648,世界贸易组织成员中，中收入和低收入国家占多大比例？,zh-CN,25%左右,Very Wrong,45%左右,Wrong,65%左右,Correct,65%左右
-1651,哪个自由贸易区包括的国家最多？,zh-CN,北大西洋自由贸易协定（NAFTA）,Very Wrong,非洲大陆自由贸易区,Correct,欧盟,Wrong,非洲大陆自由贸易区
-1616,低收入国家的经济有多少来自人们从国外寄回家的钱？,zh-CN,大约 6%,Correct,大约 26%,Wrong,大约 46%,Very Wrong,大约 6%
-1505,根据联合国的预测，到 2100 年，世界人口将再增加 24 亿。主要原因是将有更多的……,zh-CN,儿童（15 岁以下）,Very Wrong,成年人（15 到 74 岁）,Correct,老龄人口（75 岁以上）,Wrong,成年人（15 到 74 岁）
-1506,当今世界有 20 亿 0-14 岁的儿童。根据联合国的数据，2100 年将有多少儿童？,zh-CN,40 亿,Very Wrong,30 亿,Wrong,20 亿,Correct,20 亿
-1593,预计到 2050 年，全球老年人（65 岁以上）总数将增加 8 亿。其中高收入国家老年人将占多少比例？,zh-CN,15%,Correct,35%,Wrong,55%,Very Wrong,15%
-56,当下，全世界 65% 的人口处于工作年龄（15-64 岁）。联合国专家认为 2100 年这一数字会是多少？,zh-CN,50%,Wrong,60%,Correct,70%,Very Wrong,60%
-1622,目前地球上有 80 亿人口。联合国专家认为 2100 年会有多少人？,zh-CN,约 105 亿,Correct,约 165 亿,Wrong,约 225 亿,Very Wrong,约 105 亿
-1627,如果当前的人口趋势继续下去，到 2050 年，所有国家中有多少比例的国家的人口将少于今天？,zh-CN,5%,Very Wrong,15%,Wrong,25%,Correct,25%
-1631,三十年前，世界上只有不到 25%的人口生活在中等收入国家。今天的比例是多少？,zh-CN,15%左右,Very Wrong,45%左右,Wrong,75%左右,Correct,75%左右
-1654,世界人口从 60 亿增长到 70 亿用了 12 年。从 70 亿增加到 80 亿用了多长时间？,zh-CN,2 年,Very Wrong,7 年,Wrong,12 年,Correct,12 年
-39,2021 年有多少难民儿童上小学？,zh-CN,小于 20%,Very Wrong,大约 40%,Wrong,大于 60%,Correct,大于 60%
-1673,2000 年，全球约有 20% 的难民逃往高收入国家。2021 年（乌克兰战争前夕）的数字是多少？,zh-CN,20%左右,Correct,30%左右,Wrong,40%左右,Very Wrong,20%左右
-1674,2010 年，乌干达、孟加拉国、哥伦比亚和土耳其共有 40 万由人道主义危机造成的难民和移民。2022 年的数字是多少？,zh-CN,约 100 万,Very Wrong,约 500 万,Wrong,约 900 万,Correct,约 900 万
-1675,2022 年，在埃及、黎巴嫩、约旦和伊拉克的所有叙利亚难民中，有多少人表示他们无法负担食物、药品和住房等基本需求？,zh-CN,30%左右,Very Wrong,60%左右,Wrong,90%左右,Correct,90%左右
-1677,1990 年，各国人民和政府向联合国难民署（UNHCR）捐款 13 亿美元，用于帮助世界各地的难民。2022 年捐赠了多少？（金额已根据通货膨胀率进行调整）,zh-CN,略少一点,Very Wrong,多 2 倍,Wrong,多 5 倍,Correct,多 5 倍
-1525,2019 年，全球 80%的男性在劳动力市场中。女性的比例是多少？,zh-CN,52%,Correct,62%,Wrong,72%,Very Wrong,52%
-35,全世界有多少大学生在自己的本国（而不是国外）获得学位？,zh-CN,大约 77%,Very Wrong,大约 87%,Wrong,大约 97%,Correct,大约 97%
-36,在低收入国家的所有小学教师中，有多少接受过培训？,zh-CN,30%,Very Wrong,50%,Wrong,70%,Correct,70%
-38,全世界有多少儿童在入学前一年上过某种形式的学前班？,zh-CN,大约 20%,Very Wrong,大约 40%,Wrong,大约 60%,Correct,大约 60%
-1575,全世界所有到了上小学年龄却没有上学的儿童中，有多少是女孩？,zh-CN,大约 55%,Correct,大约 65%,Wrong,大约 75%,Very Wrong,大约 55%
-1579,全球有多少小学教师没有接受过相关培训？,zh-CN,大约 15%,Correct,大约 35%,Wrong,大约 55%,Very Wrong,大约 15%
-1561,1950 年，大约 50% 的成年人至少接受过一些基础教育。今天的比例是多少？,zh-CN,大约 40%,Very Wrong,大约 60%,Wrong,大约 80%,Correct,大约 80%
-1612,在撒哈拉以南非洲地区，有多少小学能用上一些电？,zh-CN,10%左右,Very Wrong,20%左右,Wrong,30%左右,Correct,30%左右
-34,全球约有 3800 万艾滋病病毒感染者。其中有多少人在 2021 年获得了抗艾滋病病毒药物？,zh-CN,15% 以下,Very Wrong,大约 30%,Wrong,50% 以上,Correct,50% 以上
-1573,全世界所有学校中，有多少比例的学校为孩子们提供肥皂和水洗手？,zh-CN,25%左右,Very Wrong,55%左右,Correct,85%左右,Wrong,55%左右
-119,1990 年，超过 400 万人死于室内污染。从那时起，这个数字发生了什么变化？,zh-CN,降低了超过30%,Correct,大致保持不变,Wrong,增加了超过30%,Very Wrong,降低了超过30%
-94,当下，全球平均寿命约为 72 岁。一百年前的全球预期寿命是多少？,zh-CN,37 岁,Correct,47 岁,Wrong,57 岁,Very Wrong,37 岁
-1605,欧洲（包括俄罗斯和土耳其）有多少儿童全程接种了两剂麻疹疫苗？,zh-CN,大约 30%,Very Wrong,大约 60%,Wrong,大约 90%,Correct,大约 90%
-1548,在撒哈拉以南非洲地区，有多少 5 岁以下儿童睡在蚊帐里以预防疟疾？,zh-CN,大约 25%,Very Wrong,大约 35%,Wrong,大约 50%,Correct,大约 50%
-1641,撒哈拉以南非洲的平均预期寿命是多少？,zh-CN,40 岁左右,Very Wrong,50 岁左右,Wrong,60 岁左右,Correct,60 岁左右
-1655,在全球范围内，非法使用毒品直接导致的死亡人数占总死亡人数的比例是多少？,zh-CN,小于 1%,Correct,10%左右,Wrong,超过 20%,Very Wrong,小于 1%
-1656,在全世界所有孕妇中，有多少人是在没有经过受过教育的助产士、护士或医生检查的前提下分娩的？,zh-CN,15%左右,Correct,35%左右,Wrong,55%左右,Very Wrong,15%左右
-30,当今的低收入国家在 1970 年的预期寿命为 44 岁。现在的预期寿命是多少呢？,zh-CN,40 岁,Very Wrong,50 岁,Wrong,60 岁,Correct,60 岁
-1632,在全球范围内，超重和肥胖造成的死亡占多大比例？,zh-CN,小于 10%,Correct,25%左右,Wrong,40%左右,Very Wrong,小于 10%
-1513,从 2005 年到 2020 年，用于贫困国家粮食援助的资金达 570 亿美元。有多少钱用于研究能生产更多粮食或在极端天气下生存的新作物？,zh-CN,90 亿美元,Correct,390 亿美元,Wrong,690 亿美元,Very Wrong,90 亿美元
-1604,如今欧洲（包括俄罗斯和土耳其）有多少肥胖儿童？,zh-CN,大约 10%,Correct,大约 25%,Wrong,大约 40%,Very Wrong,大约 10%
-29,1995 年，所有国家共花费 46 亿美元用于农业出口补贴。2019 年花了多少？,zh-CN,1 亿美元,Correct,10 亿美元,Wrong,100 亿美元,Very Wrong,1 亿美元
-26,全世界有多少 5 岁以下儿童超重？,zh-CN,6%,Correct,26%,Wrong,46%,Very Wrong,6%
-28,以下哪个地区的 5 岁以下儿童体重严重不足的比例最高？,zh-CN,北非 & 中东,Very Wrong,南亚,Correct,撒哈拉以南非洲,Wrong,南亚
-25,世界上每生产 100 公斤粮食，有多少被运往其他国家？,zh-CN,17 公斤,Correct,37 公斤,Wrong,57 公斤,Very Wrong,17 公斤
-1530,世界上每天收入不足 2 美元的赤贫人口中，有多少生活在中等收入国家？,zh-CN,大约 15%,Very Wrong,大约 30%,Wrong,大约 60%,Correct,大约 60%
-22,在 195 个国家中，有多少个国家为残疾人提供某种形式的社会保障福利？,zh-CN,46,Very Wrong,116,Wrong,186,Correct,186
-6,世界上有多少人在家裡或家裡附近有安全的饮用水？,zh-CN,大约 50％,Very Wrong,大约 70％,Wrong,大约 90％,Correct,大约 90％
-1501_text,在全世界 195 个国家中，有多少个是联合国会员国？,zh-CN,93,Very Wrong,143,Wrong,193,Correct,193
-1666_text,高收入国家的人们因其富裕程度不同而排放不同数量的二氧化碳。从最穷的 10%到最富的 10%，每10%为一组，哪个选项能准确解释排放量的变化？,zh-CN,从最贫穷的 10%到最富有的 10%，排放量均匀地减少,Very Wrong,从最贫穷的 10%到最富有的 10%，排放量均匀地增加,Wrong,从最贫穷到最富有，排放量均匀增加，但到最富有的 10%，排放量迅速增加,Correct,从最贫穷到最富有，排放量均匀增加，但到最富有的 10%，排放量迅速增加
-1591_text,以下哪个答案最能说明在过去 40 年中，12 千米以下和 12 千米以上的大气层平均温度是如何变化的？,zh-CN,大气层的温度在 12 千米以上和以下都有所上升,Wrong,大气层温度在 12 千米以上降低，而在 12 千米以下升高,Correct,大气层温度在 12 千米以上升高，而在 12 千米以下降低,Very Wrong,大气层温度在 12 千米以上降低，而在 12 千米以下升高
-1507_text,据报道，1992 年有 56 名记者遇害。此后这个数量有什么变化？,zh-CN,急剧增加，2022 年达到 463 人遇害,Very Wrong,2022 年增加到 257 人遇害,Wrong,大致保持不变，2022年有67人遇害,Correct,大致保持不变，2022年有67人遇害
diff --git a/automation-api/yival_experiments/experiment_defaults.yaml b/automation-api/yival_experiments/experiment_defaults.yaml
deleted file mode 100644
index 6a22703..0000000
--- a/automation-api/yival_experiments/experiment_defaults.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-custom_wrappers:
-  model_config_wrapper:
-    class: ./custom_configuration/model_config_wrapper.ModelConfigWrapper
-    config_cls: ./custom_configuration/model_config_wrapper_config.ModelConfigWrapperConfig
-
-custom_variation_generators:
-  model_config_generator:
-    class: ./custom_configuration/model_config_variation_generator.ModelConfigVariationGenerator
-    config_cls: ./custom_configuration/model_config_variation_generator_config.ModelConfigVariationGeneratorConfig
-
-custom_evaluators:
-  gpt4_evaluator:
-    class: ./custom_configuration/gpt4_evaluator.GPT4Evaluator
-    config_cls: ./custom_configuration/gpt4_evaluator_config.GPT4EvaluatorConfig
-
-  vertex_ai_evaluator:
-    class: ./custom_configuration/vertex_ai_evaluator.VertexAIEvaluator
-    config_cls: ./custom_configuration/vertex_ai_evaluator_config.VertexAIEvaluatorConfig
-
-  llama3_evaluator:
-    class: ./custom_configuration/llama3_evaluator.Llama3Evaluator
-    config_cls: ./custom_configuration/llama3_evaluator_config.Llama3EvaluatorConfig
-
-  simple_evaluator:
-    class: ./custom_configuration/simple_evaluator.SimpleEvaluator
-    config_cls: ./custom_configuration/simple_evaluator_config.SimpleEvaluatorConfig
-
-custom_function: model_compare.model_compare
-
-dataset:
-  source_type: dataset
-  file_path: "data/questions_LANGUAGE.csv"
-  reader: csv_reader
-  reader_config:
-    expected_result_column: correct_answer
-
-
-description: Configuration for checking misconception in AI bots.
-
-# variations and evaluators will be generated with script.
diff --git a/automation-api/yival_experiments/experiment_example.yaml b/automation-api/yival_experiments/experiment_example.yaml
deleted file mode 100644
index 00a2cab..0000000
--- a/automation-api/yival_experiments/experiment_example.yaml
+++ /dev/null
@@ -1,110 +0,0 @@
-custom_wrappers:
-  model_config_wrapper:
-    class: ./model_config_wrapper.ModelConfigWrapper
-    config_cls: ./model_config_wrapper_config.ModelConfigWrapperConfig
-
-custom_variation_generators:
-  model_config_generator:
-    class: ./model_config_variation_generator.ModelConfigVariationGenerator
-    config_cls: ./model_config_variation_generator_config.ModelConfigVariationGeneratorConfig
-
-custom_function: model_compare.model_compare
-
-dataset:
-  source_type: dataset
-  file_path: "data/questions.csv"
-  reader: csv_reader
-  reader_config:
-    expected_result_column: correct_answer
-
-description: Example config
-
-evaluators:
-  # TODO: write custom evaluator to evaluate correct rate based on all records (evvaluator_type=all)
-  - evaluator_type: individual
-    metric_calculators:
-      - method: AVERAGE   # Note: only average is supported for now
-    name: openai_prompt_based_evaluator  # this evaluator will use gpt-4
-    prompt: |-
-      You are assessing a submitted answer on a given task based on a criterion. Here is the data:
-      - Task: Given a multiple choice question, pick the correct answer.
-      - Does the answer correct?
-      [Input]: {question_text}
-      A. {option_a}
-      B. {option_b}
-      C. {option_c}
-      Correctness of Options:
-      {option_a}: {option_a_correctness}
-      {option_b}: {option_b_correctness}
-      {option_c}: {option_c_correctness}
-      [Result]: {raw_output}
-      Answer the question by selecting one of the following options:
-      A It doesn't mention what is the answer to the multiple choice question at all.
-      B It mentions the answer to the multiple choice question, but it's not in the options list. (Not in option list)
-      C It mentions the answer to the multiple choice question, but the answer is far away from the correct answer. (Very Wrong)
-      D It mentions the answer to the multiple choice question, but the answer is not correct, though close to the correct one. (Wrong)
-      E It mentions the answer to the multiple choice question, and the answer is correct. (Correct)
-    display_name: correctness
-    choices: ["A", "B", "C", "D", "E"]
-    description: Does the answer correct?
-    scale_description: "0-4"
-    choice_scores:
-      A: 0
-      B: 1
-      C: 2
-      D: 3
-      E: 4
-
-
-variations:
-  - name: model_config
-    generator_name: model_config_generator
-    generator_config:
-      models:
-        - model_name: gpt-3.5-turbo
-          params:
-            temperature: 0
-        - model_name: gpt-3.5-turbo
-          params:
-            temperature: 1
-  - name: prompt_template
-    variations:
-      - instantiated_value: |
-          Answer following multiple choices question:
-          Question: {question_text}
-          A. {option_a}
-          B. {option_b}
-          C. {option_c}
-          Answer:
-        value: |
-          Answer following multiple choices question:
-          Question: {question_text}
-          A. {option_a}
-          B. {option_b}
-          C. {option_c}
-          Answer:
-        value_type: str
-        variation_id: instruct_question
-      - instantiated_value: |
-          Question: {question_text}
-          A. {option_a}
-          B. {option_b}
-          C. {option_c}
-          Answer:
-        value: |
-          Question: {question_text}
-          A. {option_a}
-          B. {option_b}
-          C. {option_c}
-          Answer:
-        value_type: str
-        variation_id: simple
-
-human_rating_configs:
-  - name: correctness
-    instructions: Rate whether the answer clearly state what the correct answer is
-    scale: [1, 5]
-
-  - name: coherence
-    instructions: Rate whether the answer and explanation are coherent
-    scale: [1, 5]
diff --git a/automation-api/yival_experiments/notebooks/compare_evaluators.py b/automation-api/yival_experiments/notebooks/compare_evaluators.py
deleted file mode 100644
index c80dbed..0000000
--- a/automation-api/yival_experiments/notebooks/compare_evaluators.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import duckdb
-import polars as pl
-
-
-results = pl.read_parquet("../output/results.parquet")
-
-results.columns
-
-df = results.filter(pl.col("model_id").str.contains("llama"))
-
-
-conn = duckdb.connect()
-
-
-simple_eval_check = conn.query("select * from df where simple_evaluator_matching <> auto_mark_correctness")
-simple_eval_check
-simple_eval_check.to_csv("./simple_eval_check.csv")
-
-
-# NEXT: review the query and begin to check results.
-q = """select
-  *
-from
-  df
-where
-  not (
-    llama3_evaluator_correctness = vertex_ai_evaluator_correctness
-    and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness
-  )
-  or (
-    auto_mark_correctness <> 0
-    and (
-      llama3_evaluator_correctness = vertex_ai_evaluator_correctness
-      and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness
-    )
-    and auto_mark_correctness <> gpt4_evaluator_correctness 
-  )
-  or (
-    simple_evaluator_matching <> 0
-    and (
-      llama3_evaluator_correctness = vertex_ai_evaluator_correctness
-      and gpt4_evaluator_correctness = vertex_ai_evaluator_correctness
-    )
-    and simple_evaluator_matching <> gpt4_evaluator_correctness 
-  )
-  
-  """
-
-diffs = conn.query(q)
-
-conn.query("select count(*) from df")
-conn.query("select count(*) from diffs")
-
-diffs.to_csv("to_check_all.csv")
-
-410 / 30780
-
-# FIXME: the simple evaluator seems not working very well?
-# just use the exact matching algo
-
diff --git a/automation-api/yival_experiments/notebooks/final_scores.py b/automation-api/yival_experiments/notebooks/final_scores.py
deleted file mode 100644
index d4dfe47..0000000
--- a/automation-api/yival_experiments/notebooks/final_scores.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:light
-#     text_representation:
-#       extension: .py
-#       format_name: light
-#       format_version: '1.5'
-#       jupytext_version: 1.16.1
-#   kernelspec:
-#     display_name: gapminder-ai-automation-api
-#     language: python
-#     name: gapminder-ai-automation-api
-# ---
-
-# calculate final scores for models
-
-# import libs
-from collections import Counter
-import polars as pl
-import pandas as pd
-from lib.config import read_config
-from lib.pilot.helpers import read_ai_eval_spreadsheet, get_questions, get_model_configs, get_prompt_variants
-
-# load env
-config = read_config()
-
-# load ai eval spreadsheet
-ai_eval_sheet = read_ai_eval_spreadsheet()
-
-results = ai_eval_sheet.evaluation_results.data.df.copy()
-
-# use polars
-results = pl.DataFrame(results)
-
-results.columns
-
-# rename the prompt_variation_id to match our report
-results.select(pl.col(['prompt_variation_id']).unique())
-
-
-prompt_id_mapping = {
-    'instruct_question_options_1': 'prompt1',
-    'instruct_question_options_2': 'prompt3',
-    'no_option_letter': 'prompt2',
-    'zh_no_option_letter': 'prompt2',
-    'zh_instruct_2': 'prompt3',
-    'zh_instruct_1': 'prompt1'
-}
-
-results = results.with_columns(
-    pl.col('prompt_variation_id').replace(prompt_id_mapping)
-)
-
-# double check
-results['prompt_variation_id'].unique()
-
-# create a mapping for model_id -> the actual brand, name and parameters
-model_configs = get_model_configs(ai_eval_sheet, include_all=True)
-
-
-def search_model(model_config_id):
-    for model, model_config in model_configs:
-        if model_config.model_config_id == model_config_id:
-            return ' '.join([
-                model.vendor, model.model_name, model_config.model_parameters])
-    raise ValueError(f'{model_config_id} not found!')
-
-
-model_config_ids = results['model_configuration_id'].unique().to_list()
-model_config_names = [search_model(x) for x in model_config_ids]
-model_config_id_mapping = dict(zip(model_config_ids, model_config_names))
-
-
-# replace nan to indecisive in result
-results = results.with_columns(
-    pl.col('result').replace({'nan': 'indecisive'})
-)
-
-# double check
-results['model_configuration_id'].unique()
-
-
-# Table 1. The number of different answers by model and prompt
-table1 = results.with_columns(
-    pl.concat_list(pl.col([
-        'percent_correct',
-        'percent_wrong',
-        'percent_very_wrong',
-        'percent_eval_failed'])).alias('tmp')
-).with_columns(
-    pl.col('tmp').map_elements(
-        lambda x: len(list(filter(lambda e: e != 0, x)))
-    ).alias('number_of_answers')
-).select(
-    pl.exclude('tmp')
-).group_by(['model_configuration_id', 'prompt_variation_id']).agg(
-    pl.col('number_of_answers').mean()
-)
-
-table1 = table1.with_columns(
-    pl.col('model_configuration_id').replace(model_config_id_mapping).alias('model_name')
-)
-
-table1
-
-table1.write_csv('../output/report_tables/1_number_of_average_answers.csv')
-
-# Table 2. Correct / Wrong / Very Wrong / Indecisive Rates
-table2 = results.group_by(
-    ['model_configuration_id', 'prompt_variation_id']
-).agg(
-    pl.col('result').count().alias('total_questions_asked'),
-    (pl.col('result').filter(pl.col('result') == 'correct').count()
-     / pl.col('result').count()
-     * 100).alias("Correct Rate %"),
-    (pl.col('result').filter(pl.col('result') == 'wrong').count()
-     / pl.col('result').count()
-     * 100).alias("Wrong Rate %"),
-    (pl.col('result').filter(pl.col('result') == 'very_wrong').count()
-     / pl.col('result').count()
-     * 100).alias("Very Wrong Rate %"),
-    (pl.col('result').filter(pl.col('result').is_in(['indecisive', 'fail'])).count()
-     / pl.col('result').count()
-     * 100).alias("Indecisive Rate %")
-)
-
-# double check
-table2.with_columns(
-    (pl.col('Correct Rate %') +
-     pl.col('Wrong Rate %') +
-     pl.col('Very Wrong Rate %') +
-     pl.col('Indecisive Rate %')).alias('total')
-)['total'].min()  # should be about 100
-
-table2 = table2.with_columns(
-    pl.col('model_configuration_id').replace(model_config_id_mapping).alias('model_name')
-)
-
-table2
-
-table2.write_csv('../output/report_tables/2_average_rates.csv')
-
-
-# Table 3. correct rate by prompt
-# don't use 20231104 result in this table. Because in that experiment
-# we didn't test prompt3.
-table3 = results.filter(
-    ~pl.col('last_evaluation_datetime').is_in(['20231104'])
-).group_by(
-    ['prompt_variation_id']
-).agg(
-    pl.col('result').count().alias('total_questions_asked'),
-    (pl.col('result').filter(pl.col('result') == 'correct').count()
-     / pl.col('result').count()
-     * 100).alias("Correct Rate %"),
-    (pl.col('result').filter(pl.col('result') == 'wrong').count()
-     / pl.col('result').count()
-     * 100).alias("Wrong Rate %"),
-    (pl.col('result').filter(pl.col('result') == 'very_wrong').count()
-     / pl.col('result').count()
-     * 100).alias("Very Wrong Rate %"),
-    (pl.col('result').filter(pl.col('result').is_in(['indecisive', 'fail'])).count()
-     / pl.col('result').count()
-     * 100).alias("Indecisive Rate %")
-)
-
-table3
-
-table3.write_csv('../output/report_tables/3_correct_rate_by_prompt.csv')
diff --git a/automation-api/yival_experiments/notebooks/human_rating.py b/automation-api/yival_experiments/notebooks/human_rating.py
deleted file mode 100644
index 0c491e0..0000000
--- a/automation-api/yival_experiments/notebooks/human_rating.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.16.2
-#   kernelspec:
-#     display_name: Python 3 (ipykernel)
-#     language: python
-#     name: python3
-# ---
-
-# %% [markdown]
-# # Usage
-#
-# Use this notebook to generate a file which contains results which auto mark correctness is different from LLM agent evaluator. Then merge the result back.
-
-# %% [markdown]
-# ## Generate file
-
-# %%
-# going to use duckdb
-# %load_ext sql
-
-# %%
-# %sql duckdb://
-
-# %%
-import os.path as osp
-import pandas as pd
-
-output_dir = '../output/'
-
-# %%
-result_file = osp.join(output_dir, 'results.xlsx')
-
-result_df = pd.read_excel(result_file)
-
-# %% magic_args="--save result_to_check_1 " language="sql"
-# select * 
-# from result_df 
-# where human_rating_score is null
-
-# %% magic_args="--save result_to_check_2" language="sql"
-# select 
-#     *,
-#     case 
-#         when correctness != 0 and auto_mark_correctness != correctness then 1
-#         when auto_mark_correctness = 0 and correctness = 0 then 1
-#     else 0 
-#     end as need_to_check
-# from result_to_check_1
-# where need_to_check = 1
-
-# %%
-# result_to_check = %sql select * exclude (need_to_check) from result_to_check_2
-
-# %%
-result_to_check_df = result_to_check.DataFrame()
-
-# %%
-result_to_check_df.shape
-
-# %%
-result_to_check_df['raw_output'] = result_to_check_df['raw_output'].str.strip()
-
-# %%
-result_to_check_df.to_excel(osp.join(output_dir, 'human_rating.xlsx'), index=False)
-
-# %%
-
-# %%
-
-# %%
-raise Exception("Please edit the human_rating file.")
-
-# %% [markdown]
-# ## Edit file, and then run below cells to merge back
-
-# %%
-rating_file = osp.join(output_dir, 'human_rating.xlsx')
-
-# %%
-human_ratings = pd.read_excel(rating_file)
-
-# %%
-human_ratings[~pd.isnull(human_ratings.human_rating_score)]
-
-# %%
-result_df_copy = result_df.copy()
-
-# %%
-result_df_copy = result_df_copy.reset_index()
-
-# %% magic_args="merged_results << " language="sql"
-# select 
-#     r.* exclude (human_rating_score, index),
-#     coalesce(l.human_rating_score, r.human_rating_score) as human_rating_score
-# from 
-#     result_df_copy r full join human_ratings l
-#     on (r.experiment_date = l.experiment_date 
-#     and r.question_id = l.question_id 
-#     and r.model_id = l.model_id 
-#     and r.model_params = l.model_params 
-#     and r.prompt_template = l.prompt_template)
-
-# %%
-merged_results_df = merged_results.DataFrame()
-
-# %%
-assert merged_results_df.shape == result_df.shape
-
-# %%
-result_df.shape
-
-# %%
-merged_results_df.drop_duplicates().shape
-
-# %% language="sql"
-# select
-#     *
-# from 
-#     result_df r anti join merged_results_df l
-#     on r.experiment_date = l.experiment_date 
-#     and r.question_id = l.question_id 
-#     and r.model_id = l.model_id 
-#     and r.model_params = l.model_params 
-#     and r.prompt_template = l.prompt_template
-
-# %%
-
-# %%
-merged_results_df[~pd.isnull(merged_results_df.human_rating_score)]
-
-# %%
-merged_results_df.to_excel(osp.join(output_dir, 'results.xlsx'), index=False)
-
-# %%
diff --git a/automation-api/yival_experiments/notebooks/result_data_analysis.py b/automation-api/yival_experiments/notebooks/result_data_analysis.py
deleted file mode 100644
index e7815db..0000000
--- a/automation-api/yival_experiments/notebooks/result_data_analysis.py
+++ /dev/null
@@ -1,1089 +0,0 @@
-# # Result Data Analysis
-#
-# This notebook is for producing tables listed in https://docs.google.com/spreadsheets/d/1ln5ui3f13AfAQkBuEMbNomBXlZLhkQPYVEpBlZjUtu0/edit?pli=1#gid=0
-#
-# Latest Update: 2024-10-02
-
-# going to use duckdb
-# %load_ext sql
-
-# %sql duckdb://
-
-import pandas as pd
-import polars as pl
-from lib.pilot.helpers import read_ai_eval_spreadsheet, get_questions, get_model_configs, get_prompt_variants
-from lib.config import read_config
-import matplotlib.pyplot as plt
-
-# load env
-config = read_config()
-
-
-
-# ## prepare data
-
-# results to be analyzed
-# manually download from AI eval spreadsheet.
-result = pd.concat([
-    pd.read_csv('./data/Gapminder AI evaluations - Master Output.csv'),
-    # pd.read_csv('./data/Gapminder AI evaluations - Latest Results.csv'),
-])
-
-# load ai eval spreadsheet
-ai_eval_sheet = read_ai_eval_spreadsheet()
-
-result
-
-# cleanup
-result.columns = result.columns.map(lambda x: x.lower().replace(' ', '_'))
-
-result
-
-
-
-# + magic_args="--save result_to_analyze " language="sql"
-# select
-#     *,
-#     CASE
-#     WHEN ((Result = 'correct')) THEN (3)
-#     WHEN ((Result = 'wrong')) THEN (2)
-#     WHEN ((Result = 'very_wrong')) THEN (1)
-#     WHEN ((Result = 'fail')) THEN (0)
-#     ELSE 0
-#   END AS score
-# from result
-# where model_configuration_id != 'mc026'  -- exclude qwen 1201
-
-# + magic_args="--with result_to_analyze --save result_chn_prompt_renamed" language="sql"
-# select
-#    * exclude (prompt_variation_id),
-#    replace(prompt_variation_id, '_zh', '') as prompt_variation_id
-# from result_to_analyze
-# -
-
-
-
-
-# models
-all_models = ai_eval_sheet.gen_ai_model_configs.data.df
-
-all_models.tail()
-
-# prompts
-all_prompts = ai_eval_sheet.prompt_variations.data.df
-
-all_prompts.tail()
-
-# all_prompts_filtered = %sql select variation_id, prompt_family, prompt_variation, language, question_template, question_prompt_template from all_prompts where prompt_family != 'none';
-all_prompts_filtered.DataFrame().to_csv('./data/outputs/prompts_table.csv', index=False)
-
-
-
-
-
-# question in eval sheet
-eval_questions = ai_eval_sheet.questions.data.df
-
-eval_questions.columns
-
-# all questions in contentful export
-all_questions = pd.read_csv('./data/contentful_questions_data.csv')
-
-# + magic_args="--save questions_and_topics" language="sql"
-# SELECT
-#   e."question_id",
-#   e."published_version_of_question",
-#   e."language",
-#   l.wrongPercentage AS human_wrong_percentage,
-#   str_split (l.included_in_tests_within_these_topic_ids, ';') AS topic_list,
-#   filter (topic_list, (x -> x like 'sdg-world-__')) [1] AS sdg_topic,
-#   filter (
-#     topic_list,
-#     (
-#       x -> list_contains (
-#         main.list_value (
-#           'refugees',
-#           'population',
-#           'sustainable-development-misconception-study-2020',
-#           '2017_gapminder_test',
-#           'climate-misconception-study-2024',
-#           'sdg-world-un-goals'
-#         ),
-#         x
-#       )
-#     )
-#   ) AS other_topics_list,
-#   list_string_agg(other_topics_list) as other_topics
-# FROM
-#   eval_questions AS e
-#   LEFT JOIN all_questions AS l ON (
-#     (
-#       replace(e."question_id", '_text', '') = CAST(l.globalId AS VARCHAR)
-#     )
-#   )
-# ORDER BY
-#   e."language",
-#   l.globalId;
-# -
-
-# export a csv for supplement tables
-# question_table = %sql select * exclude (topic_list, other_topics_list) from questions_and_topics;
-question_table_df = question_table.DataFrame()
-
-question_table_df.to_csv('./data/outputs/question_table.csv', index=False)
-
-
-
-
-
-# + magic_args="--save q_and_t" language="sql"
-# -- only keep question id and topic list.
-# select
-#     question_id,
-#     first(human_wrong_percentage) as human_wrong_percentage,
-#     first(topic_list) as topic_list,
-#     first(sdg_topic) as sdg_topic,
-#     first(other_topics_list) as other_topics
-# from questions_and_topics
-# group by question_id
-# -
-
-
-
-
-
-
-# ## Summary
-
-# ### Correctness Break Down by Model
-
-# + magic_args="result_by_models <<" language="sql"
-# select
-#     m.model_id as model_id,
-#     count(*) as total_count,
-#     count(*) filter (result != 'fail') as total_count_exclude_indecisive,
-#     count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate,
-#     100 - correct_rate as wrong_rate,
-#     count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate
-# from result_to_analyze r left join all_models m on r.model_configuration_id = m.model_config_id
-# GROUP BY m.model_id
-# order by correct_rate desc
-# -
-result_by_models_df = result_by_models.DataFrame()
-result_by_models_df
-
-
-
-# ### The Top 5 and Bottom 5 prompts of a model
-
-# + magic_args="--save by_prompt_and_model" language="sql"
-# select
-#     model_configuration_id,
-#     prompt_variation_id,
-#     count(*) as total_count,
-#     count(*) filter (result != 'fail') as total_count_exclude_indecisive,
-#     count(*) filter (result = 'correct') / total_count_exclude_indecisive * 100 as correct_rate,
-#     100 - correct_rate as wrong_rate,
-#     count(*) filter (result = 'fail') / total_count * 100 as indecisive_rate,
-#     row_number() over (partition by model_configuration_id order by correct_rate desc) as rank
-# from result_chn_prompt_renamed
-# GROUP BY prompt_variation_id, model_configuration_id
-
-# + magic_args="--save by_prompt_and_model_with_rank by_prompt_and_model_with_rank_df <<" language="sql"
-# select *
-# from by_prompt_and_model
-# where
-#    list_contains([1,2,3,4,5, 108, 107, 106, 105, 104], rank)
-# order by model_configuration_id, rank
-# -
-
-by_prompt_and_model_with_rank_df = by_prompt_and_model_with_rank_df.DataFrame()
-
-by_prompt_and_model_with_rank_df
-
-by_prompt_and_model_with_rank_df.to_csv('./data/outputs/new_prompt_model_bottoms.csv')
-
-# + magic_args="avg_model_correct_rate <<" language="sql"
-# select model_configuration_id, mean(correct_rate)
-# from by_prompt_and_model
-# group by model_configuration_id
-# order by model_configuration_id
-# -
-avg_model_correct_rate_df = avg_model_correct_rate.DataFrame()
-
-avg_model_correct_rate_df
-
-
-
-
-
-
-# ## Model, Prompt Family, Topic aggregations
-
-# + magic_args="--save res_with_prompt_family" language="sql"
-# select
-#     r.*,
-#     p.prompt_family
-# from result_to_analyze r left join all_prompts p on r.prompt_variation_id = p.variation_id
-
-# + magic_args="--save res_with_prompt_family_exclude_ind" language="sql"
-# select * from res_with_prompt_family where score != 0
-# -
-
-# ### highest variance by model
-
-# + magic_args="--save prompt_variance_stat" language="sql"
-# select
-#       model_configuration_id,
-#       question_id,
-#       stddev_pop(score) / mean (score) * 100 as variance,
-#       -- count(DISTINCT score) as variance
-#     from
-#       res_with_prompt_family_exclude_ind
-#     group by
-#       model_configuration_id,
-#       question_id
-#     order by
-#       "variance" desc
-
-# + magic_args="--save prompt_variance_stat_2" language="sql"
-# select 
-#     model_configuration_id,
-#     question_id,
-#     variance,
-#     rank() over (PARTITION by (model_configuration_id) order by variance desc) as rank
-# from prompt_variance_stat
-
-# + magic_args="high_variance_questions <<" language="sql"
-# select * from prompt_variance_stat_2 where rank <= 10
-# -
-
-high_variance_questions_df = high_variance_questions.DataFrame()
-
-high_variance_questions_df.to_csv('./data/outputs/new_high_variance_questions.csv', index=False)
-
-
-
-
-
-
-
-
-
-# ### Model vs Prompt Family
-
-# +
-# I need to check the variance cause by Prompt Family for each Model.
-# So I will first check the answer variance of each question, then get the average variance of all questions.
-
-# + magic_args="--save model_prompt_stat1" language="sql"
-# select
-#       prompt_family,
-#       model_configuration_id,
-#       question_id,
-#       count(*) as total_amount,
-#       count(*) filter (score = 3) / total_amount * 100 as correct_rate,
-#       stddev_pop(score) / mean (score) * 100 as variance,
-#       -- count(DISTINCT score) as variance
-#       mode(score) as mode_score
-#     from
-#       res_with_prompt_family_exclude_ind
-#     group by
-#       prompt_family,
-#       model_configuration_id,
-#       question_id
-#     order by
-#       "correct_rate" desc
-# + magic_args="--save model_prompt_stat2" language="sql"
-# select
-#       r.prompt_family,
-#       r.model_configuration_id,
-#       r.question_id,
-#       (1 - count(*) filter (r.score = s1.mode_score) / count(*)) * 100 as variance_2
-#       -- count(*)
-#     from
-#       res_with_prompt_family_exclude_ind r
-#     left join model_prompt_stat1 s1
-#     on
-#       r.prompt_family = s1.prompt_family AND
-#       r.model_configuration_id = s1.model_configuration_id AND
-#       r.question_id = s1.question_id
-#     group by
-#       r.prompt_family,
-#       r.model_configuration_id,
-#       r.question_id
-
-# + magic_args="--save model_prompt_stat3" language="sql"
-# select
-#       prompt_family,
-#       model_configuration_id,
-#       question_id,
-#       count(*) as total_amount,
-#       count(*) filter (score = 0) / total_amount * 100 as indecisive_rate
-#     from
-#       res_with_prompt_family
-#     group by
-#       prompt_family,
-#       model_configuration_id,
-#       question_id
-
-# + magic_args="model_prompt_stats <<" language="sql"
-# select
-#   r1.prompt_family,
-#   r1.model_configuration_id,
-#   mean (correct_rate) as cr,
-#   mean (indecisive_rate) as ir,
-#   mean (variance) as variance
-# from
-#   model_prompt_stat1 r1
-#   left join model_prompt_stat2 r2 on r1.prompt_family = r2.prompt_family
-#       and r1.model_configuration_id = r2.model_configuration_id
-#       and r1.question_id = r2.question_id
-#   left join model_prompt_stat3 r3 on r1.prompt_family = r3.prompt_family
-#       and r1.model_configuration_id = r3.model_configuration_id
-#       and r1.question_id = r3.question_id
-# group by
-#   r1.prompt_family,
-#   r1.model_configuration_id
-# order by
-#   r1.model_configuration_id,
-#   r1.prompt_family
-#
-# -
-
-tmp_df1 = model_prompt_stats.DataFrame()
-
-tmp_df1.set_index(['prompt_family', 'model_configuration_id'])
-
-tmp_df1.to_csv('./data/outputs/new_model_vs_prompt_family.csv', index=False)
-
-
-
-
-
-# ### Model vs Topic
-# Same as above, need to calculate variance per question first and get the average.
-
-# + magic_args="--save model_question_stat1" language="sql"
-# select
-#     question_id,
-#     model_configuration_id,
-#     count(*) filter (
-#       score = 3
-#     ) / count(*) * 100 as correct_rate,
-#     stddev_pop(score) / mean(score) * 100 as variance
-#     -- count(DISTINCT score) as variance
-#   from
-#     (select * from result_to_analyze where score != 0)
-#   group by
-#     question_id,
-#     model_configuration_id
-
-# + magic_args="--save model_question_stat2" language="sql"
-#   select
-#     question_id,
-#     model_configuration_id,
-#     count(*) filter (
-#       score = 0
-#     ) / count(*) * 100 as indecisive_rate
-#   from
-#     result_to_analyze
-#   group by
-#     question_id,
-#     model_configuration_id
-
-# + magic_args="--save model_question_stat_all" language="sql"
-# select
-#     r1.*,
-#     r2.indecisive_rate
-#   from
-#     model_question_stat1 r1
-#   left join model_question_stat2 r2 on
-#     r1.question_id = r2.question_id
-#     and r1.model_configuration_id = r2.model_configuration_id
-
-# + magic_args="--save model_topic_stat" language="sql"
-#   select
-#     r.*,
-#     q.sdg_topic,
-#     q.other_topics,
-#     q.human_wrong_percentage,
-#     case
-#       when q.sdg_topic is null then other_topics
-#       else list_append(q.other_topics, q.sdg_topic)
-#     end as all_topics
-#
-#   from
-#     model_question_stat_all r
-#   left join q_and_t q on
-#     r.question_id = q.question_id
-
-# + magic_args="--with model_topic_stat model_topic_res <<" language="sql"
-# select
-#   model_configuration_id,
-#   topic,
-#   count(*) as "number of qs",
-#   mean (correct_rate) as correct_rate,
-#   mean (indecisive_rate) as indecisive_rate,
-#   mode (variance) as variance
-# from
-#   (
-#     select
-#       * exclude (all_topics, sdg_topic, other_topics),
-#       unnest (all_topics) as topic
-#     from
-#       model_topic_stat
-#   )
-# group by
-#   topic,
-#   model_configuration_id
-# order by
-#   topic,
-#   model_configuration_id
-# -
-
-model_topic_res_df = model_topic_res.DataFrame().set_index(['model_configuration_id', 'topic'])
-
-model_topic_res_df.to_csv('./data/outputs/new_model_vs_topic.csv')
-
-model_topic_res_df.describe()
-
-
-
-
-
-
-
-# ## Questions where AI worse than human and monkey
-
-# ### human score
-
-100 - all_questions['wrongPercentage'].mean()
-
-
-
-# + language="sql"
-# select * from model_topic_stat;
-
-# + magic_args="model_topic_diff <<" language="sql"
-# select
-#   question_id,
-#   model_configuration_id,
-#     (100 - correct_rate) as ai_wrong_percentage,
-#     human_wrong_percentage,
-#     2/3 * 100 as monkey_wrong_percentage,
-#   ai_wrong_percentage - human_wrong_percentage as compare_to_human,
-#     ai_wrong_percentage - monkey_wrong_percentage as compare_to_monkey,
-#     sdg_topic,
-#     other_topics
-# from model_topic_stat
-# where compare_to_human > 0 OR compare_to_monkey > 0
-# order by
-#     "sdg_topic",
-#     cast(other_topics as varchar),
-#     "model_configuration_id"
-# -
-
-model_topic_diff
-
-model_topic_diff_df = model_topic_diff.DataFrame()
-
-model_topic_diff_df.shape
-
-model_topic_diff_df.to_csv('./data/outputs/new_ai_worse_all.csv', index=False)
-
-
-
-# +
-# make a complete list combining worse than human and worse than monkey
-
-# + magic_args="all_worse_questions <<" language="sql"
-# select question_id, model_configuration_id 
-# from
-#     model_topic_diff_df
-
-# + magic_args="very_wrong_res <<" language="sql"
-# select * from result_to_analyze where result = 'very_wrong'
-# -
-
-
-
-# +
-# now find one case for very wrong for these questions.
-# -
-
-r1 = all_worse_questions.DataFrame()
-r2 = very_wrong_res.DataFrame()
-
-r2_ = r2.groupby(['question_id', 'model_configuration_id']).agg(lambda x: x.sample(1)).reset_index()
-
-# + magic_args="--save all_worse_very_wrong" language="sql"
-# select 
-#     r1.question_id, r1.model_configuration_id, prompt_variation_id
-# from 
-#      r1 
-#     left join 
-#      r2_ 
-#     on 
-#         r1.question_id = r2_.question_id and r1.model_configuration_id = r2_.model_configuration_id
-
-# + language="sql"
-# select *
-# from r1
-# where 
-#     question_id = '1640' and model_configuration_id = 'mc039'
-
-# + language="sql"
-# select *
-# from r2_
-# where 
-#     question_id = '1640' and model_configuration_id = 'mc039'
-
-# +
-# Why??? Because there is no very wrong answer for this combination!
-# -
-
-
-
-# all_worse_very_wrong_df = %sql select * from all_worse_very_wrong
-
-all_worse_very_wrong_df = all_worse_very_wrong_df.DataFrame()
-
-all_worse_very_wrong_df[pd.isnull(all_worse_very_wrong_df['prompt_variation_id'])]
-
-
-
-
-
-# +
-# query example responses
-# but first, we need to read all result data...
-# -
-
-# FIXME: change ../output/results.parquet to correct archive path.
-raw_data_fs = [
-    '../output/results.parquet',  # for mc039
-    '../output/archives/20240521/results.xlsx',
-    '../output/archives/20240401/results.xlsx',
-    '../output/archives/20240501/results.xlsx',
-    '../output/archives/20240516/results.xlsx',
-    '../output/archives/20240601/results.xlsx',
-    '../output/archives/20240910/results.xlsx'
-]
-
-pd.read_parquet(raw_data_fs[0]).columns
-
-# +
-cols = ['experiment_date', 'question_id', 'model_id', 'prompt_template', 'question', 'raw_output']
-
-raw_data_lst = list()
-
-for x in raw_data_fs:
-    if 'parquet' in x:
-        raw_data_lst.append(pd.read_parquet(x)[cols])
-    else:
-        raw_data_lst.append(pd.read_excel(x)[cols])
-# -
-
-raw_data = pd.concat(raw_data_lst, ignore_index=True)
-
-raw_data
-
-# fix a few experiment model id
-raw_data.loc[raw_data['model_id'] == 'gpt-4', 'model_id'] = 'gpt-4-0613' 
-raw_data.loc[raw_data['model_id'] == 'gpt-4o', 'model_id'] = 'gpt-4o-2024-05-13' 
-
-
-
-
-
-# +
-# now we should make all columns we needed
-# 1. question and answers
-# 2. prompt template
-# 3. model configuration id
-# -
-
-# first do prompt template
-# load all configuration files and get a mapping.
-import yaml
-
-sorted([str(x) for x in raw_data['experiment_date'].unique()])
-
-configuration_list = [
-    '../experiment_configurations/experiment_202403291214_gpt-4-0125-preview_en-US.yaml',
-    '../experiment_configurations/experiment_202403291248_gemini_gemini-1-0-pro_en-US.yaml',
-    '../experiment_configurations/experiment_202403291536_gemini_gemini-1-0-pro_en-US.yaml',
-    '../experiment_configurations/experiment_202404011622_qwen-max-1201_zh-CN.yaml',
-    '../experiment_configurations/experiment_202404051719_gpt-4-0125-preview_en-US.yaml',
-    '../experiment_configurations/experiment_202404102325_qwen-max-1201_zh-CN.yaml',
-    '../experiment_configurations/experiment_202404201136_vertex_ai_gemini-1-5-pro_en-US.yaml',
-    '../experiment_configurations/experiment_202404201344_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml',
-    '../experiment_configurations/experiment_202405012311_qwen-max-0403_zh-CN.yaml',
-    '../experiment_configurations/experiment_202405162215_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml',
-    '../experiment_configurations/experiment_202405162248_qwen-max-0403_zh-CN.yaml',
-    '../experiment_configurations/experiment_202405162244_qwen-max-0403_zh-CN.yaml',
-    '../experiment_configurations/experiment_202405242125_gpt-4o-2024-05-13_en-US.yaml',
-    '../experiment_configurations/experiment_202405281300_replicate_meta_meta-llama-3-70b-instruct_en-US.yaml',
-    '../experiment_configurations/experiment_202405291053_vertex_ai_claude-3-opus@20240229_en-US.yaml',
-    '../experiment_configurations/experiment_202406040141_qwen-max-0428_en-US.yaml',
-    '../experiment_configurations/experiment_202408291204_gpt-4o-2024-08-06_en-US.yaml',
-    '../experiment_configurations/experiment_202408310828_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml',
-    '../experiment_configurations/experiment_202409102304_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml',
-    '../experiment_configurations/experiment_202409211350_qwen-max-2024-09-19_en-US.yaml',
-]
-
-# +
-prompt_template_list = list()
-
-for x in configuration_list:
-    c = yaml.safe_load(open(x, 'r'))
-    p = pd.DataFrame.from_records(c['variations'][1]['variations'])
-    prompt_template_list.append(p)
-# -
-
-all_prompt_templates = pd.concat(prompt_template_list, ignore_index=True)
-
-all_prompt_templates = all_prompt_templates.drop_duplicates(subset=['value'])
-
-all_prompt_templates_mapping = all_prompt_templates.set_index('value')['variation_id'].to_dict()
-
-for k, v in all_prompt_templates_mapping.items():
-    print(k)
-    print(v)
-    break
-
-
-
-raw_data['prompt_template'].map(all_prompt_templates_mapping).hasnans  # should be False
-
-raw_data['prompt_variation_id'] = raw_data['prompt_template'].map(all_prompt_templates_mapping)
-
-
-
-# +
-# next add model_configuration_id
-# -
-
-# all_models_ = %sql select * from all_models where repeat_times = 1
-
-all_models_ = all_models_.DataFrame()
-
-all_models_mapping = all_models_.set_index('model_id')['model_config_id'].to_dict()
-
-raw_data['model_id'].map(all_models_mapping).hasnans
-
-raw_data['model_configuration_id'] = raw_data['model_id'].map(all_models_mapping)
-
-raw_data
-
-
-
-# + language="sql"
-# select
-#     DISTINCT model_id 
-# from 
-#     raw_data
-# where
-#     prompt_variation_id like '%zh%'
-# -
-
-
-
-
-
-
-
-
-
-# +
-# questions and answers mapping
-# -
-
-all_questions.columns
-
-qs = ai_eval_sheet.questions.data.df.copy()
-qs = qs[['question_id', 'language', 'published_version_of_question']]
-
-qs
-
-q_dict = qs.set_index(["question_id", "language"])["published_version_of_question"].to_dict()
-
-# +
-ans = ai_eval_sheet.question_options.data.df.copy()
-ans_dict = dict()
-
-for qid, adf in ans.groupby(["question_id", "language"]):
-    adict = adf.set_index('letter')['question_option'].to_dict()
-    ans_dict[qid] = adict
-# -
-
-ans_dict[("1", "en-US")]
-
-q_dict[("1", "en-US")]
-
-
-
-# +
-# create final output
-# -
-
-all_worse_very_wrong_df
-
-raw_data.dtypes
-
-raw_data['experiment_date'] = raw_data['experiment_date'].map(lambda x: str(x))
-raw_data['question_id'] = raw_data['question_id'].map(lambda x: str(x))
-raw_data['model_id'] = raw_data['model_id'].map(lambda x: str(x))
-
-raw_data_pl = pl.from_pandas(raw_data)
-
-# +
-raw_output_lst = list()
-prompt_lst = list()
-
-
-for _, row in all_worse_very_wrong_df.iterrows():
-    question_id = row['question_id']
-    model_configuration_id = row['model_configuration_id']
-    prompt_variation_id = row['prompt_variation_id']
-    # print(question_id, model_configuration_id, prompt_variation_id)
-
-    raw_data_row = raw_data_pl.filter(
-        (pl.col('question_id') == question_id) & (pl.col('model_configuration_id') == model_configuration_id) & (pl.col('prompt_variation_id') == prompt_variation_id)
-    )
-
-    if raw_data_row.is_empty():
-        raw_output_lst.append(None)
-        prompt_lst.append(None)
-    else:
-        question_text = raw_data_row['question'].item()
-        question_id = raw_data_row['question_id'].item()
-        language = 'zh-CN' if '_zh' in prompt_variation_id else 'en-US'
-        answers = ans_dict[(question_id, language)]
-        option_a = answers['A']
-        option_b = answers['B']
-        option_c = answers['C']
-
-        prompt_template = raw_data_row['prompt_template'].item()
-        prompt = prompt_template.format(question_text=question_text, option_a=option_a, option_b=option_b, option_c=option_c)
-        # print(prompt)
-
-        prompt_lst.append(prompt)
-        raw_output_lst.append(raw_data_row['raw_output'].item())
-    
-# -
-raw_data_row
-
-all_worse_very_wrong_df['prompt'] = prompt_lst
-all_worse_very_wrong_df['model_output'] = raw_output_lst
-
-all_worse_very_wrong_df
-
-all_worse_very_wrong_df.to_csv('./data/outputs/new_ai_worse_sample.csv', index=False)
-
-
-
-
-
-
-
-# ## Examples for high variance questions
-
-high_variance_questions_df
-
-# + language="sql"
-# select * from result_to_analyze
-# -
-
-
-
-question_id = '1792'
-model_configuration_id = 'mc039'
-grade = 'very_wrong'
-
-
-# + magic_args="--save grade_example" language="sql"
-#
-# select * from
-#     (
-#     select * from result_to_analyze
-#     where
-#         question_id = '{{question_id}}' 
-#         and model_configuration_id = '{{model_configuration_id}}' 
-#         and result = '{{grade}}'
-#     )
-# using sample 1
-# -
-
-def filter_grade(question_id, model_configuration_id, grade):
-    # res = %sql select * from (select * from result_to_analyze where question_id = '{{question_id}}' and model_configuration_id = '{{model_configuration_id}}' and result = '{{grade}}') using sample 1
-    return res
-
-
-filter_grade(question_id, model_configuration_id, grade)
-
-
-
-
-
-# +
-correct_lst = list()
-wrong_lst = list()
-very_wrong_lst = list()
-correct_prompt_lst = list()
-wrong_prompt_lst = list()
-very_wrong_prompt_lst = list()
-
-output_lists = [correct_lst, wrong_lst, very_wrong_lst]
-prompt_lists = [correct_prompt_lst, wrong_prompt_lst, very_wrong_prompt_lst]
-
-for _, row in high_variance_questions_df.iterrows():
-    question_id = row['question_id']
-    model_configuration_id = row['model_configuration_id']
-    # prompt_variation_id = row['prompt_variation_id']
-    # print(question_id, model_configuration_id)
-
-    examples = list()
-    for g in ['correct', 'wrong', 'very_wrong']:
-        grade = g
-        example = filter_grade(question_id, model_configuration_id, grade)
-        # print(example)
-        if len(example) > 0:
-            e = next(example.dicts())
-            assert e['result'] == grade
-            examples.append(e)
-        else:
-            examples.append(None)
-
-    for i, e in enumerate(examples):
-        if e:
-            prompt_variation_id = e['prompt_variation_id']
-            raw_data_row = raw_data_pl.filter(
-                (pl.col('question_id') == question_id) 
-                & (pl.col('model_configuration_id') == model_configuration_id) 
-                & (pl.col('prompt_variation_id') == prompt_variation_id)
-            )
-            if raw_data_row.is_empty():
-                print(question_id, model_configuration_id, prompt_variation_id)
-                output_lists[i].append(None)
-                prompt_lists[i].append(None)
-                continue
-            question_text = raw_data_row['question'].item()
-            language = 'zh-CN' if '_zh' in prompt_variation_id else 'en-US'
-            answers = ans_dict[(question_id, language)]
-            option_a = answers['A']
-            option_b = answers['B']
-            option_c = answers['C']
-            prompt_template = raw_data_row['prompt_template'].item()
-            prompt = prompt_template.format(question_text=question_text, option_a=option_a, option_b=option_b, option_c=option_c)
-            output_lists[i].append(raw_data_row['raw_output'].item())
-            prompt_lists[i].append(prompt)
-        else:
-            output_lists[i].append(None)
-            prompt_lists[i].append(None)
-
-# -
-prompt_lists[0][0]
-
-prompt_lists[1][0]
-
-
-
-
-
-
-
-high_variance_questions_df['correct_prompt_example'] = prompt_lists[0]
-high_variance_questions_df['correct_answer_example'] = output_lists[0]
-high_variance_questions_df['wrong_prompt_example'] = prompt_lists[1]
-high_variance_questions_df['wrong_answer_example'] = output_lists[1]
-high_variance_questions_df['very_wrong_prompt_example'] = prompt_lists[2]
-high_variance_questions_df['very_wrong_answer_example'] = output_lists[2]
-
-high_variance_questions_df
-
-high_variance_questions_df.to_csv('./data/outputs/new_high_variance_questions_sample.csv', index=False)
-
-
-
-
-
-# ## Questions where AI scores best
-
-# + magic_args="ai_best_questions <<" language="sql"
-# select 
-#     question_id,
-#     mean(correct_rate) as avg_correct_rate,
-#     mean(indecisive_rate) as avg_inde_rate,
-#     mean(variance) as avg_variance,
-# from model_topic_stat
-# group by question_id
-# order by avg_correct_rate desc, avg_inde_rate
-# ;
-# -
-
-ai_best_questions_df = ai_best_questions.DataFrame()
-
-ai_best_questions_df.head(15)
-
-
-
-
-
-# # for double checking the evaluators
-# check the top 10, bottom 10 questions per model
-
-# + magic_args="--save double_check_results" language="sql"
-# select
-#   question_id,
-#   model_configuration_id,
-#     (100 - correct_rate) as ai_wrong_percentage,
-#     human_wrong_percentage,
-#   ai_wrong_percentage - human_wrong_percentage as diff,
-#     sdg_topic,
-#     other_topics
-# from model_topic_stat
-# -- where diff > 0
-# order by
-#     "sdg_topic",
-#     cast(other_topics as varchar),
-#     "model_configuration_id"
-
-# + language="sql"
-# select * 
-# from double_check_results 
-# where model_configuration_id = 'mc026' AND ai_wrong_percentage = 0
-# order by question_id
-
-# + magic_args="--save double_check_results_1" language="sql"
-# select
-#     model_configuration_id,
-#     question_id,
-#     ai_wrong_percentage,
-#     rank() over (partition by model_configuration_id order by ai_wrong_percentage) as rank
-# from double_check_results
-# order by model_configuration_id, rank, question_id
-
-# + magic_args="to_check <<" language="sql"
-#
-# select * from double_check_results_1 where rank <= 10 OR rank >= 275
-# -
-
-to_check_df = to_check.DataFrame()
-
-to_check_df[to_check_df['model_configuration_id'] == 'mc026']
-
-
-
-
-
-
-
-# # for climate study questions
-
-climate_questions = ["5", "59", "85", "86", "1524", "1672", "1691", "1706", "1717", "1730", "1731", "1737", "1738", "1741", "1761"]
-
-# + magic_args="--save result_climate_questions" language="sql"
-# select
-#     *
-# from result_to_analyze
-# where list_contains({{climate_questions}}, question_id) AND model_configuration_id != 'mc028';
-# -
-
-# climate_raw_result = %sql select * from result_climate_questions
-
-climate_raw_result.DataFrame().to_csv('./data/outputs/climate_raw.csv', index=False)
-
-# + magic_args="--save correct_by_prompt climate_res << " language="sql"
-# select
-#     model_configuration_id,
-#     prompt_variation_id,
-#     count(*),
-#
-# from result_climate_questions
-# where result = 'correct'
-# group by model_configuration_id, prompt_variation_id
-# -
-
-climate_res.DataFrame().to_csv("./data/outputs/climate_study.csv")
-
-# +
-# another way to calculate correctness
-
-# + magic_args="--save climate_question_correctness" language="sql"
-# select
-#     model_configuration_id,
-#     count(*) filter (result != 'fail') as total_count,
-#     count(*) filter (result = 'correct') as correct_count,
-#     correct_count / total_count * 100 as correct_rate,
-#     correct_rate * 15 / 100 as correct_num_average
-# from result_climate_questions
-# group by model_configuration_id
-
-# + language="sql"
-# select mean(correct_num_average) from climate_question_correctness;
-# -
-
-
-
-# + magic_args="--save climate_question_correctness" language="sql"
-# select
-#     count(*) filter (result != 'fail') as total_count,
-#     count(*) filter (result = 'correct') as correct_count,
-#     correct_count / total_count * 100 as correct_rate,
-#     correct_rate * 15 / 100 as correct_num_average
-# from result_climate_questions
-# -
-
-34/3
-
-
-
-
-
-
-
-# # Check raw outputs
-
-outputs1 = pd.read_excel('../output/archives/20240401/results.xlsx')
-outputs2 = pd.read_excel('../output/results.xlsx')
-
-outputs = pd.concat([outputs1, outputs2], ignore_index=True)
-
-outputs
-
-
-
-outputs.to_parquet("./data/outputs/latest_results.parquet")
-
-
-
-# alibaba = %sql select * from outputs where model_id = 'qwen-max-0403'
-# err = %sql select * from outputs where model_id = 'qwen-max-0403' and raw_output like '%Error%'
-
-err.DataFrame().head(10)
-
-# +
-# Issue: Seems the gpt 4 evaluator grades some Error and indecisive answers as "correct"..
-# -
-
-
-
-err.DataFrame().shape
-
-alibaba.DataFrame().shape
-
-60 / 30348  # still have 0.1% of API Error
-
-# + magic_args="err2 <<" language="sql"
-# select * from outputs where model_id = 'qwen-max-0403' and
-#  (raw_output like '%抱歉%'
-#     OR raw_output like '%遗憾%'
-#     OR raw_output like '%对不起%'
-#     OR raw_output like '%无法%')  -- these are answers including the word "Sorry" or "I can't"
-# -
-
-err2.DataFrame()
-
-err2.DataFrame().shape
diff --git a/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py b/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py
deleted file mode 100644
index 36083e1..0000000
--- a/automation-api/yival_experiments/notebooks/upload_to_ai_eval_sheet.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:light
-#     text_representation:
-#       extension: .py
-#       format_name: light
-#       format_version: '1.5'
-#       jupytext_version: 1.16.2
-#   kernelspec:
-#     display_name: gapminder-ai-automation-api
-#     language: python
-#     name: gapminder-ai-automation-api
-# ---
-
-"""A script to create and upload result table to AI Eval Spreadsheet.
-"""
-
-import re
-import json
-import numpy as np
-import pandas as pd
-import polars as pl
-import yaml
-from datetime import datetime
-from langdetect import detect
-from lib.pilot.helpers import read_ai_eval_spreadsheet, get_questions, get_model_configs, get_prompt_variants
-from lib.config import read_config
-
-# load env
-config = read_config()
-
-raw_results = pd.read_excel('../output/results.xlsx')
-# also, save it as parquet, for easier loading into other tools
-# raw_results.to_parquet('../notebooks/data/raw_results_experiment_3.parquet')
-
-# set question_id field to string
-raw_results['question_id'] = raw_results['question_id'].astype(str)
-
-raw_results
-
-# load AI Eval Spreadsheet
-ai_eval_sheet = read_ai_eval_spreadsheet()
-
-# create a mapping from question text -> question_id + language pair.
-questions = get_questions(ai_eval_sheet, include_all=True)
-
-
-# Possible Issue: the question is gone or changed in Ai Eval spreadsheet
-# so we need to detect the language if we can't find that question.
-# Here is a function to detect if an input string is English or Chinese
-def suggest_language(q_text):
-    lang = detect(q_text)
-    if lang == 'en':
-        return 'en-US'
-    else:
-        return 'zh-CN'
-
-
-q_text_to_q_id_mapping = {}
-
-for _, row in raw_results[['question_id', 'question']].drop_duplicates().iterrows():
-    q_text = row['question']
-    q_id = row['question_id']
-    matched = False
-    for q, _ in questions:
-        if q_id == q.question_id:
-            if q_text.strip() == q.published_version_of_question.strip():
-                matched = True
-                q_text_to_q_id_mapping[q_text] = (q.question_id, q.language)
-            else:
-                lang = suggest_language(q_text)
-                if lang == q.language:
-                    matched = True
-                    q_text_to_q_id_mapping[q_text] = (q.question_id, q.language)
-                    print(f"Q{q_id} have different question text.")
-                    print(q_text.strip())
-                    print(q.published_version_of_question.strip())
-        if matched:
-            break
-
-    if not matched:
-        lang = suggest_language(q_text)
-        print(q_id, q_text[:10], '...', 'does not exist, detected lang:', lang)
-        q_text_to_q_id_mapping[q_text] = (q_id, lang)
-
-
-# q_text_to_q_id_mapping
-len(q_text_to_q_id_mapping)
-
-# double check: numbers of english questions and chinese questions
-en = list(filter(lambda v: v[1] == 'en-US', q_text_to_q_id_mapping.values()))
-en_ids = [x[0] for x in en]
-cn = list(filter(lambda v: v[1] == 'zh-CN', q_text_to_q_id_mapping.values()))
-cn_ids = [x[0] for x in cn]
-
-# this should output an empty set
-# if English question set and Chinese question set are the same.
-set(en_ids) - set(cn_ids)
-set(cn_ids) - set(en_ids)
-
-# +
-# fix for experiment 20231104: the gpt-4 is gpt-4-0613
-raw_results.loc[raw_results['model_id'] == 'gpt-4', 'model_id'] = 'gpt-4-0613'
-
-# fix for experiment 20240521: gpt-4o is gpt-4o-2024-05-13
-raw_results.loc[raw_results['model_id'] == 'gpt-4o', 'model_id'] = 'gpt-4o-2024-05-13'
-# -
-
-
-# create a mapping from model_id, parameters -> model_config id
-# NOTE: because we only search for model_id and parameters,
-# we may found duplicates: same model_id and parameters,
-# but different rounds/memory settings.
-# That's why I don't include all rows here, we should manually ensure that
-# the model we actually tested are enabled in the AI eval sheet.
-# TODO: see if we can auto detect the correct model configuration.
-model_configs = get_model_configs(ai_eval_sheet, include_all=False)
-
-
-model_id_params_to_model_config_mapping = {}
-for model_id, params in raw_results[['model_id', 'model_params']].drop_duplicates().values:
-    matched = False
-    for model, conf in model_configs:
-        if model.model_id == model_id and params == str(json.loads(conf.model_parameters)):
-            if matched:  # we found a duplicate
-                print("duplicated rows found for model conf:",
-                      model_id,
-                      params)
-                raise ValueError("duplicated rows")
-            model_id_params_to_model_config_mapping[(model_id, params)] = conf.model_config_id
-            matched = True
-    if not matched:
-        print(model_id,
-              params,
-              "not found. Please ensure it's enabled in the AI Eval Spreadsheet.")
-        raise KeyError("model configuration not exist")
-
-
-model_id_params_to_model_config_mapping
-
-raise Exception("Please check if file names are correct in next cell.")
-
-# create a mapping from prompt_variant_text -> prompt_variant_id
-# to get the most accurate mapping, we will load the prompts from the experiment files
-# be sure to change the name
-cn_exp_config = yaml.safe_load(open('../experiment_configurations/experiment_202405162248_qwen-max-0403_zh-CN.yaml', 'r'))
-en_exp_config = yaml.safe_load(open('../experiment_configurations/experiment_202409211350_qwen-max-2024-09-19_en-US.yaml', 'r'))
-
-assert cn_exp_config['variations'][1]['name'] == 'prompt_template'
-assert en_exp_config['variations'][1]['name'] == 'prompt_template'
-
-cn_prompts = pd.DataFrame.from_records(cn_exp_config['variations'][1]['variations'])
-en_prompts = pd.DataFrame.from_records(en_exp_config['variations'][1]['variations'])
-
-all_prompts = pd.concat([cn_prompts, en_prompts], ignore_index=True)
-
-prompt_text_to_prompt_id_mapping = all_prompts.set_index('value')['variation_id'].to_dict()
-
-prompt_text_to_prompt_id_mapping
-
-# convert the raw result to a dataframe with labelled data.
-result = raw_results.copy()
-
-# convert to question id.
-result['language'] = result['question'].map(lambda x: q_text_to_q_id_mapping[x][1])
-
-# convert to prompt variant id
-result['prompt_variant_id'] = result['prompt_template'].map(lambda x: prompt_text_to_prompt_id_mapping[x])
-
-# convert to model_conf_id
-result['model_conf_id'] = [model_id_params_to_model_config_mapping[
-    (row['model_id'], row['model_params'])] for _, row in result.iterrows()]
-
-# update the correctness column with human scores
-result['final_score'] = (result['human_rating_score']
-    .fillna(result['vertex_ai_evaluator_correctness'])
-    .fillna(result['gpt4_evaluator_correctness'])
-)
-
-result[pd.isnull(result["final_score"])]  # this sould be empty
-
-# counting
-# let's use polars from now
-result = pl.DataFrame(result)
-result
-
-# concat all evaluation results as list
-result = result.with_columns(
-    pl.concat_list(pl.col(['gpt4_evaluator_gpt4_eval_correctness',
-         'vertex_ai_evaluator_gemini_eval_correctness',
-         'vertex_ai_evaluator_claude_eval_correctness',])).alias("evaluation_results")
-)
-
-# +
-# result.group_by(
-#     ['question_id', 'language', 'prompt_variant_id', 'model_conf_id']
-# ).agg(
-#     pl.col('correctness').value_counts()
-# )
-# -
-
-# FIX: In experiment on 2024-04, due to issue in prompt we tested the ideology-capitalist_zh prompt for alibaba twice.
-# we will just keep one.
-result = result.group_by(
-    ['question_id', 'language', 'prompt_variant_id', 'model_conf_id', 'experiment_date']
-).agg(pl.all().first())
-
-
-
-
-# then calculate the distribution
-result_counts = result.with_columns(
-    pl.col('evaluation_results').list.count_matches(0).alias('fail'),
-    pl.col('evaluation_results').list.count_matches(1).alias('very_wrong'),
-    pl.col('evaluation_results').list.count_matches(2).alias('wrong'),
-    pl.col('evaluation_results').list.count_matches(3).alias('correct'),
-)
-
-result_counts
-
-result_counts['rounds'].max()
-
-
-# set the number of evaluators
-num_of_evaluators = 3
-
-result_pct = result_counts.with_columns(
-    pl.col('fail') / num_of_evaluators * 100,
-    pl.col('very_wrong') / num_of_evaluators * 100,
-    pl.col('wrong') / num_of_evaluators * 100,
-    pl.col('correct') / num_of_evaluators * 100,
-)
-
-result_pct
-
-# calculate the final grade
-def get_grade(dictionary):
-    max_value = max(dictionary.values())
-    max_keys = [key for key, value in dictionary.items() if value == max_value]
-
-    if len(max_keys) > 1:
-        return "n/a"
-    else:
-        return max_keys[0]
-
-
-result_full = result_pct.with_columns(
-    pl.struct(pl.col(['fail', 'very_wrong', 'wrong', 'correct'])).map_elements(get_grade).alias('result'),
-    pl.lit(1).alias('rounds')
-)
-
-
-# then if we have human ratings, update the results.
-result_full = result_full.with_columns(
-    pl.col('human_rating_score').replace(
-        dict(enumerate(['fail', 'very_wrong', 'wrong', 'correct']))
-    ).fill_null(pl.col('result')).alias('result')
-)
-
-
-result_full
-
-result_full_df = result_full.to_pandas()
-result_full_df.columns
-
-
-result_full_df = result_full_df.loc[:, 
-    [
-        'question_id', 'language', 'prompt_variant_id', 'model_conf_id', 'experiment_date',
-          'fail', 'very_wrong', 'wrong', 'correct', 'rounds', 'result', 
-    ]
-]
-
-result_full_df.columns = ['question_id', 'language', 'prompt_variation_id',
-                          'model_configuration_id', 'last_evaluation_datetime',
-                          'percent_eval_failed', 'percent_very_wrong', 'percent_wrong',
-                          'percent_correct', 'rounds', 'result']
-
-backup = ai_eval_sheet.evaluation_results.data.df.copy()
-
-backup.columns
-
-result_full_df = result_full_df[backup.columns]
-
-ai_eval_sheet.evaluation_results.replace_data(result_full_df)
-
-
diff --git a/automation-api/yival_experiments/output/report_tables/1_number_of_average_answers.csv b/automation-api/yival_experiments/output/report_tables/1_number_of_average_answers.csv
deleted file mode 100644
index 87abce1..0000000
--- a/automation-api/yival_experiments/output/report_tables/1_number_of_average_answers.csv
+++ /dev/null
@@ -1,35 +0,0 @@
-model_configuration_id,prompt_variation_id,number_of_answers,model_name
-mc019,prompt3,1.5,"Google PaLM (Chat Bison) {""temperature"": 0.01}"
-mc024,prompt2,1.0607142857142857,"OpenAI GPT4 Turbo Jan 2024 {""temperature"": 0.01}"
-mc021,prompt1,1.167857142857143,"OpenAI GPT4 Turbo Nov 2023 {""temperature"": 0.01}"
-mc023,prompt2,1.6535714285714285,"Alibaba Qianwen Max {""temperature"": 0.01}"
-mc015,prompt2,1.0613026819923372,"Meta llama2 (hosted on replicate) {""temperature"": 0.01}"
-mc019,prompt2,1.0,"Google PaLM (Chat Bison) {""temperature"": 0.01}"
-mc018,prompt1,1.7633587786259541,"OpenAI GPT3.5 June 2023 {""temperature"": 1}"
-mc016,prompt2,1.0190839694656488,"OpenAI GPT3.5 June 2023 {""temperature"": 0.01}"
-mc015,prompt1,1.0766283524904214,"Meta llama2 (hosted on replicate) {""temperature"": 0.01}"
-mc021,prompt3,1.042857142857143,"OpenAI GPT4 Turbo Nov 2023 {""temperature"": 0.01}"
-mc024,prompt1,1.0571428571428572,"OpenAI GPT4 Turbo Jan 2024 {""temperature"": 0.01}"
-mc023,prompt3,1.4392857142857143,"Alibaba Qianwen Max {""temperature"": 0.01}"
-mc015,prompt3,1.0,"Meta llama2 (hosted on replicate) {""temperature"": 0.01}"
-mc020,prompt1,1.0214285714285714,"Google Gemini Pro {""temperature"": 0.01}"
-mc018,prompt2,1.6641221374045803,"OpenAI GPT3.5 June 2023 {""temperature"": 1}"
-mc014,prompt2,1.0240384615384615,"Alibaba Qianwen Plus {""top_p"": 0.1, ""top_k"": 100}"
-mc014,prompt1,1.0096153846153846,"Alibaba Qianwen Plus {""top_p"": 0.1, ""top_k"": 100}"
-mc021,prompt2,1.1,"OpenAI GPT4 Turbo Nov 2023 {""temperature"": 0.01}"
-mc009,prompt2,1.7644787644787645,"Google PaLM (Text Bison) {""temperature"": 0.01}"
-mc022,prompt3,1.1321428571428571,"OpenAI GPT3.5 Nov 2023' {""temperature"": 0.01}"
-mc016,prompt3,1.0,"OpenAI GPT3.5 June 2023 {""temperature"": 0.01}"
-mc009,prompt1,1.9034749034749034,"Google PaLM (Text Bison) {""temperature"": 0.01}"
-mc023,prompt1,1.4214285714285715,"Alibaba Qianwen Max {""temperature"": 0.01}"
-mc024,prompt3,1.05,"OpenAI GPT4 Turbo Jan 2024 {""temperature"": 0.01}"
-mc017,prompt1,1.1603053435114503,"OpenAI GPT4 June 2023 {""temperature"": 0.01}"
-mc020,prompt3,1.0178571428571428,"Google Gemini Pro {""temperature"": 0.01}"
-mc020,prompt2,1.0,"Google Gemini Pro {""temperature"": 0.01}"
-mc019,prompt1,1.0,"Google PaLM (Chat Bison) {""temperature"": 0.01}"
-mc017,prompt2,1.1488549618320612,"OpenAI GPT4 June 2023 {""temperature"": 0.01}"
-mc016,prompt1,1.0458015267175573,"OpenAI GPT3.5 June 2023 {""temperature"": 0.01}"
-mc018,prompt3,1.5,"OpenAI GPT3.5 June 2023 {""temperature"": 1}"
-mc017,prompt3,1.0,"OpenAI GPT4 June 2023 {""temperature"": 0.01}"
-mc022,prompt1,1.0857142857142856,"OpenAI GPT3.5 Nov 2023' {""temperature"": 0.01}"
-mc022,prompt2,1.1178571428571429,"OpenAI GPT3.5 Nov 2023' {""temperature"": 0.01}"
diff --git a/automation-api/yival_experiments/output/report_tables/2_average_rates.csv b/automation-api/yival_experiments/output/report_tables/2_average_rates.csv
deleted file mode 100644
index e23b0a8..0000000
--- a/automation-api/yival_experiments/output/report_tables/2_average_rates.csv
+++ /dev/null
@@ -1,35 +0,0 @@
-model_configuration_id,prompt_variation_id,total_questions_asked,Correct Rate %,Wrong Rate %,Very Wrong Rate %,Indecisive Rate %,model_name
-mc019,prompt1,2,50.0,0.0,50.0,0.0,"Google PaLM (Chat Bison) {""temperature"": 0.01}"
-mc021,prompt2,280,82.5,12.5,2.5,2.5,"OpenAI GPT4 Turbo Nov 2023 {""temperature"": 0.01}"
-mc022,prompt2,280,65.0,25.0,8.928571428571429,1.0714285714285714,"OpenAI GPT3.5 Nov 2023' {""temperature"": 0.01}"
-mc024,prompt1,280,79.64285714285714,16.785714285714285,3.571428571428571,0.0,"OpenAI GPT4 Turbo Jan 2024 {""temperature"": 0.01}"
-mc015,prompt2,261,50.191570881226056,36.7816091954023,9.961685823754788,3.065134099616858,"Meta llama2 (hosted on replicate) {""temperature"": 0.01}"
-mc019,prompt3,2,50.0,50.0,0.0,0.0,"Google PaLM (Chat Bison) {""temperature"": 0.01}"
-mc020,prompt2,280,63.92857142857142,26.071428571428573,9.642857142857144,0.35714285714285715,"Google Gemini Pro {""temperature"": 0.01}"
-mc023,prompt1,280,64.64285714285715,15.714285714285714,8.928571428571429,10.714285714285714,"Alibaba Qianwen Max {""temperature"": 0.01}"
-mc024,prompt3,280,74.28571428571429,23.57142857142857,2.142857142857143,0.0,"OpenAI GPT4 Turbo Jan 2024 {""temperature"": 0.01}"
-mc014,prompt1,208,36.53846153846153,32.21153846153847,12.01923076923077,19.230769230769234,"Alibaba Qianwen Plus {""top_p"": 0.1, ""top_k"": 100}"
-mc023,prompt3,280,62.142857142857146,21.428571428571427,8.928571428571429,7.5,"Alibaba Qianwen Max {""temperature"": 0.01}"
-mc016,prompt2,262,67.93893129770993,14.50381679389313,11.068702290076336,6.488549618320611,"OpenAI GPT3.5 June 2023 {""temperature"": 0.01}"
-mc016,prompt1,262,38.93129770992366,45.80152671755725,15.267175572519085,0.0,"OpenAI GPT3.5 June 2023 {""temperature"": 0.01}"
-mc017,prompt1,262,59.16030534351145,27.099236641221374,2.2900763358778624,11.450381679389313,"OpenAI GPT4 June 2023 {""temperature"": 0.01}"
-mc018,prompt2,262,61.06870229007634,13.740458015267176,9.923664122137405,15.267175572519085,"OpenAI GPT3.5 June 2023 {""temperature"": 1}"
-mc018,prompt1,262,34.73282442748092,41.603053435114504,17.17557251908397,6.488549618320611,"OpenAI GPT3.5 June 2023 {""temperature"": 1}"
-mc016,prompt3,2,50.0,50.0,0.0,0.0,"OpenAI GPT3.5 June 2023 {""temperature"": 0.01}"
-mc017,prompt2,262,63.358778625954194,15.267175572519085,0.7633587786259541,20.610687022900763,"OpenAI GPT4 June 2023 {""temperature"": 0.01}"
-mc017,prompt3,2,50.0,0.0,0.0,50.0,"OpenAI GPT4 June 2023 {""temperature"": 0.01}"
-mc018,prompt3,2,0.0,100.0,0.0,0.0,"OpenAI GPT3.5 June 2023 {""temperature"": 1}"
-mc014,prompt2,208,40.38461538461539,21.153846153846153,13.461538461538462,25.0,"Alibaba Qianwen Plus {""top_p"": 0.1, ""top_k"": 100}"
-mc020,prompt1,280,53.57142857142857,37.857142857142854,8.571428571428571,0.0,"Google Gemini Pro {""temperature"": 0.01}"
-mc023,prompt2,280,65.0,17.857142857142858,3.571428571428571,13.571428571428571,"Alibaba Qianwen Max {""temperature"": 0.01}"
-mc015,prompt1,261,31.800766283524908,50.191570881226056,16.091954022988507,1.9157088122605364,"Meta llama2 (hosted on replicate) {""temperature"": 0.01}"
-mc019,prompt2,2,100.0,0.0,0.0,0.0,"Google PaLM (Chat Bison) {""temperature"": 0.01}"
-mc020,prompt3,280,50.357142857142854,39.64285714285714,10.0,0.0,"Google Gemini Pro {""temperature"": 0.01}"
-mc021,prompt1,280,78.57142857142857,18.21428571428571,2.5,0.7142857142857143,"OpenAI GPT4 Turbo Nov 2023 {""temperature"": 0.01}"
-mc022,prompt1,280,37.142857142857146,48.214285714285715,14.285714285714285,0.35714285714285715,"OpenAI GPT3.5 Nov 2023' {""temperature"": 0.01}"
-mc009,prompt1,259,38.996138996138995,27.7992277992278,22.393822393822393,10.81081081081081,"Google PaLM (Text Bison) {""temperature"": 0.01}"
-mc009,prompt2,259,36.293436293436294,42.084942084942085,14.671814671814673,6.94980694980695,"Google PaLM (Text Bison) {""temperature"": 0.01}"
-mc015,prompt3,2,50.0,50.0,0.0,0.0,"Meta llama2 (hosted on replicate) {""temperature"": 0.01}"
-mc021,prompt3,280,76.42857142857142,21.071428571428573,2.5,0.0,"OpenAI GPT4 Turbo Nov 2023 {""temperature"": 0.01}"
-mc022,prompt3,280,35.0,50.0,15.0,0.0,"OpenAI GPT3.5 Nov 2023' {""temperature"": 0.01}"
-mc024,prompt2,280,84.28571428571429,13.214285714285715,2.142857142857143,0.35714285714285715,"OpenAI GPT4 Turbo Jan 2024 {""temperature"": 0.01}"
diff --git a/automation-api/yival_experiments/output/report_tables/3_correct_rate_by_prompt.csv b/automation-api/yival_experiments/output/report_tables/3_correct_rate_by_prompt.csv
deleted file mode 100644
index b2ad37e..0000000
--- a/automation-api/yival_experiments/output/report_tables/3_correct_rate_by_prompt.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-prompt_variation_id,total_questions_asked,Correct Rate %,Wrong Rate %,Very Wrong Rate %,Indecisive Rate %
-prompt1,1410,62.5531914893617,27.4468085106383,7.588652482269503,2.4113475177304964
-prompt2,1410,72.12765957446808,18.79432624113475,5.319148936170213,3.7588652482269502
-prompt3,1410,59.50354609929078,31.27659574468085,7.659574468085106,1.5602836879432624
diff --git a/automation-api/yival_experiments/scripts/fetch_questions.py b/automation-api/yival_experiments/scripts/fetch_questions.py
deleted file mode 100644
index 14ab6b8..0000000
--- a/automation-api/yival_experiments/scripts/fetch_questions.py
+++ /dev/null
@@ -1,72 +0,0 @@
-from pathlib import Path
-
-import pandas as pd
-
-from lib.pilot.helpers import get_questions, read_ai_eval_spreadsheet
-
-current_script_path = Path(__file__).parent
-
-
-correctness_map = {1: "Correct", 2: "Wrong", 3: "Very Wrong"}
-
-
-def main():
-    print("Reading AI eval spreadsheet")
-    sheet = read_ai_eval_spreadsheet()
-    print("Getting questions")
-    questions = get_questions(sheet)
-
-    if len(questions) == 0:
-        print("Empty Question set. Please double check the Questions sheet.")
-        return
-
-    output_list = []
-
-    for q, opts in questions:
-        output_item = {
-            "question_id": q.question_id,
-            "question_text": q.published_version_of_question.strip(),
-            "language": q.language,
-        }
-
-        # sometimes option letter is missing. We will keep a list of available
-        # letters for this situation.
-        available_letters = ["a", "b", "c"]
-        for opt in opts:
-            letter = opt.letter.lower()
-            if letter in available_letters:
-                available_letters.remove(letter)
-
-        for opt in opts:
-            letter = opt.letter.lower()
-            if letter not in ["a", "b", "c"]:
-                letter = available_letters.pop(0)  # pick one available
-            output_item[f"option_{letter}"] = opt.question_option
-            output_item[f"option_{letter}_correctness"] = correctness_map[
-                opt.correctness_of_answer_option
-            ]
-            if opt.correctness_of_answer_option == 1:
-                output_item["correct_answer"] = opt.question_option
-
-        # detect any null columns
-        for k, v in output_item.items():
-            if pd.isnull(v):
-                raise ValueError(f"nan found in item: {output_item}")
-
-        output_list.append(output_item)
-
-    output_df = pd.DataFrame.from_records(output_list)
-
-    # Grouping the DataFrame by 'language'
-    grouped = output_df.groupby("language")
-
-    for language, group in grouped:
-        # Constructing the filename for each language
-        output_file = current_script_path / f"../data/questions_{language}.csv"
-        # Saving each group to a separate CSV file
-        group.to_csv(output_file, index=False)
-        print(f"Questions in '{language}' language saved to {output_file}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/automation-api/yival_experiments/scripts/generate_experiment_config.py b/automation-api/yival_experiments/scripts/generate_experiment_config.py
deleted file mode 100644
index f89b471..0000000
--- a/automation-api/yival_experiments/scripts/generate_experiment_config.py
+++ /dev/null
@@ -1,194 +0,0 @@
-import os
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Dict, List
-
-import yaml
-
-from lib.ai_eval_spreadsheet.schemas import PromptVariation
-from lib.ai_eval_spreadsheet.wrapper import AiEvalData
-from lib.pilot.helpers import (
-    ModelAndConfig,
-    get_metrics,
-    get_model_configs,
-    get_prompt_variants,
-    load_model_parameters,
-    read_ai_eval_spreadsheet,
-)
-
-current_script_path = Path(__file__).parent
-
-# to make pyyaml's dumper generate good looking strings
-# https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
-def str_presenter(dumper, data):
-    if len(data.splitlines()) > 1:  # check for multiline string
-        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
-    return dumper.represent_scalar("tag:yaml.org,2002:str", data)
-
-
-yaml.add_representer(str, str_presenter)
-# -
-
-base_configs_path = current_script_path / "../experiment_defaults.yaml"
-experiment_configurations_path = current_script_path / "../experiment_configurations/"
-latest_experiment_path = current_script_path / "../experiment_latest.yaml"
-
-
-def get_evaluators(
-    ai_eval_sheet: AiEvalData, evaluator_model="gpt4"
-) -> List[Dict[str, Any]]:
-    metrics = get_metrics(ai_eval_sheet)
-    res = list()
-
-    if evaluator_model == "gpt4":
-        evaluator_name = "gpt4_evaluator"
-        model_name = "gpt-4o-2024-11-20"
-    elif evaluator_model == "claude":
-        evaluator_name = "vertex_ai_evaluator"
-        model_name = "vertex_ai/claude-3-5-sonnet@20240620"
-    elif evaluator_model == "llama":
-        evaluator_name = "llama3_evaluator"
-        model_name = "replicate/meta/meta-llama-3.1-405b-instruct"
-    elif evaluator_model == "gemini":
-        evaluator_name = "vertex_ai_evaluator"
-        model_name = "vertex_ai/gemini-1.5-pro-002"
-    else:
-        raise ValueError(f"{evaluator_model} is not a supported evaluator")
-
-    for m in metrics:
-        metric: Dict[str, Any] = dict()
-        metric["evaluator_type"] = "individual"
-        metric["metric_calculators"] = [{"method": "AVERAGE"}]
-        metric["name"] = evaluator_name
-        metric["model_name"] = model_name
-        metric["prompt"] = m.prompt
-        metric["choices"] = m.choices.split(", ")
-        metric["description"] = m.description
-        metric["choice_scores"] = dict(
-            zip(m.choices.split(", "), map(int, m.choice_scores.split(", ")))
-        )
-        metric["scale_description"] = "{}-{}".format(
-            m.choice_scores[0], m.choice_scores[-1]
-        )
-        metric["display_name"] = f"{evaluator_model}_{m.name}"
-        res.append(metric)
-
-    return res
-
-
-def get_model_variations_yaml_dict(model_configs: List[ModelAndConfig]):
-    res: Dict[str, Any] = dict()
-    res["name"] = "model_config"
-    res["generator_name"] = "model_config_generator"
-    variant_list = list()
-    for model, config in model_configs:
-        for t in range(config.repeat_times):
-            model_dict: Dict[str, Any] = dict()
-            model_dict["vendor"] = model.vendor
-            model_dict["model_id"] = model.model_id
-            model_dict["params"] = load_model_parameters(config.model_parameters)
-            model_dict["round"] = t + 1
-            variant_list.append(model_dict)
-
-    res["generator_config"] = {"models": variant_list}
-    return res
-
-
-def get_prompt_variations_yaml_dict(prompt_variations: List[PromptVariation]):
-    res: Dict[str, Any] = dict()
-    res["name"] = "prompt_template"
-    variant_list = list()
-    for p in prompt_variations:
-        variant_dict = dict()
-        variant_dict["variation_id"] = p.variation_id
-        variant_dict["value_type"] = "str"
-        value = p.question_prompt_template.format(question=p.question_template)
-        variant_dict["instantiated_value"] = value
-        variant_dict["value"] = value
-        variant_list.append(variant_dict)
-
-    res["variations"] = variant_list
-    return res
-
-
-def main():
-    print("Reading AI eval spreadsheet")
-    sheet = read_ai_eval_spreadsheet()
-    # load default config
-    config = yaml.load(open(base_configs_path, "r"), Loader=yaml.Loader)
-
-    # metrics
-    evaluators = list()
-    for evaluator_model in ["gpt4", "claude", "gemini"]:
-        evaluators.extend(get_evaluators(sheet, evaluator_model=evaluator_model))
-    config["evaluators"] = evaluators
-
-    # also append a simple evaluator
-    simple_evaluator = {
-        "evaluator_type": "individual",
-        "metric_calculators": [{"method": "AVERAGE"}],
-        "name": "simple_evaluator",
-    }
-    config["evaluators"].append(simple_evaluator)
-
-    # model configs and prompt variations
-    model_configs = get_model_configs(sheet)
-    model_ids = {model.model_id for model, model_config in model_configs}
-    prompt_variations = get_prompt_variants(sheet)
-    prompt_variation_languages = {
-        prompt_variation.language for prompt_variation in prompt_variations
-    }
-
-    experiment_names = []
-    for model_id in model_ids:
-        model_id_specific_model_configurations = [
-            (model, model_config)
-            for model, model_config in model_configs
-            if model.model_id == model_id
-        ]
-        model_configs_yaml_dict = get_model_variations_yaml_dict(
-            model_id_specific_model_configurations
-        )
-
-        for prompt_variation_language in prompt_variation_languages:
-
-            # filter out prompt variations that are not in the current language
-            language_specific_prompt_variations = [
-                prompt_variation
-                for prompt_variation in prompt_variations
-                if prompt_variation.language == prompt_variation_language
-            ]
-
-            # compile the configuration
-            prompt_variations_yaml_dict = get_prompt_variations_yaml_dict(
-                language_specific_prompt_variations
-            )
-            config["variations"] = [
-                model_configs_yaml_dict,
-                prompt_variations_yaml_dict,
-            ]
-            config["dataset"][
-                "file_path"
-            ] = f"data/questions_{prompt_variation_language}.csv"
-
-            # create configuration yaml file
-            os.makedirs(experiment_configurations_path, exist_ok=True)
-            now = datetime.now()
-            yival_sanitized_model_id = model_id.replace("/", "_").replace(".", "-")
-            experiment_name = f'experiment_{now.strftime("%Y%m%d%H%M")}_{yival_sanitized_model_id}_{prompt_variation_language}'
-            file_name = f"{experiment_name}.yaml"
-            output_file = experiment_configurations_path / file_name
-            with open(output_file, "w") as f:
-                yaml.dump(config, stream=f, sort_keys=False, allow_unicode=True)
-                f.close()
-            experiment_names.append(experiment_name)
-
-    print(
-        f"Experiment configurations saved to {experiment_configurations_path}. To run them:"
-    )
-    for experiment_name in experiment_names:
-        print(f"  poe run_experiment --experiment={experiment_name}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/automation-api/yival_experiments/scripts/generate_result.py b/automation-api/yival_experiments/scripts/generate_result.py
deleted file mode 100644
index 3ca1e9d..0000000
--- a/automation-api/yival_experiments/scripts/generate_result.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import os.path as osp
-import pickle
-from glob import glob
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-from yival.experiment.experiment_runner import Experiment
-
-current_script_path = Path(__file__).parent
-
-# all Yival experiment results are exported into pickle files.
-# you can use follow code to explore the structure.
-# change fp variable to the pickle file path
-# fp = "path/to/result.pkl"
-# data: Experiment = pickle.load(open(fp, "rb"))
-# data.group_experiment_results[0].asdict()
-# result = data.group_experiment_results[1]
-# rs = result.experiment_results
-# len(rs)
-# rs[1].asdict()
-
-# We will combine all pickle files in output dir and calculate final scores.
-# In this script, we store all responses into an excel file.
-output_dir = current_script_path / "../output"
-
-option_score_mapping = {"Correct": 3, "Wrong": 2, "Very Wrong": 1}
-
-
-def exact_match_correctness(answer, options, correctness):
-    option_occurance = [0, 0, 0]
-    scores = [option_score_mapping[x] for x in correctness]
-    for i, o in zip(range(3), options):
-        if o.strip().lower() in answer.strip().lower():
-            option_occurance[i] = 1
-    if sum(option_occurance) == 1:
-        score = scores[option_occurance.index(1)]
-    else:
-        score = 0
-
-    return score
-
-
-def extract_correct_answer(options, correctness):
-    for t, c in zip(options, correctness):
-        if c == "Correct":
-            return t
-
-
-if __name__ == "__main__":
-    output_list = []
-
-    for fp in glob(f"{output_dir}/*.pkl"):
-        # Note: we assumed that the filenames are begging with "experiment_${date}_"
-        # so that we can extract the date from result files.
-        expr_date = osp.basename(fp).split("_")[1][:8]
-        data: Experiment = pickle.load(open(fp, "rb"))
-        for group_results in data.group_experiment_results:
-            for result in group_results.experiment_results:
-                row = result.input_data.content
-                answer = result.raw_output.text_output
-                option_a = row["option_a"]
-                option_b = row["option_b"]
-                option_c = row["option_c"]
-                option_a_correctness = row["option_a_correctness"]
-                option_b_correctness = row["option_b_correctness"]
-                option_c_correctness = row["option_c_correctness"]
-                options = [option_a, option_b, option_c]
-                correctness = [
-                    option_a_correctness,
-                    option_b_correctness,
-                    option_c_correctness,
-                ]
-                auto_mark_correctness = exact_match_correctness(
-                    answer, options, correctness
-                )
-                correct_answer = extract_correct_answer(options, correctness)
-                result_dict = dict(
-                    experiment_date=expr_date,
-                    question_id=str(result.input_data.content["question_id"]),
-                    model_id=result.combination["model_config"]["model_id"],
-                    model_params=str(result.combination["model_config"]["params"]),
-                    prompt_template=result.combination["prompt_template"],
-                    question=result.input_data.content["question_text"],
-                    raw_output=result.raw_output.text_output,
-                    correct_answer=correct_answer,
-                    auto_mark_correctness=auto_mark_correctness,
-                )
-                for eval_output in result.evaluator_outputs:
-                    col_name = f"{eval_output.name}_{eval_output.display_name}"
-                    result_dict[col_name] = eval_output.result
-
-                output_list.append(result_dict)
-
-    output_df = pd.DataFrame.from_records(output_list)
-    # add a human rating column
-    output_df["human_rating_score"] = np.nan
-    output_df.to_excel(osp.join(output_dir, "results.xlsx"), index=False)
-    output_df.to_parquet(osp.join(output_dir, "results.parquet"), index=False)
-
-    print("done")
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_20231104_cn.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_20231104_cn.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_20231104_cn.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_20231104_cn.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_20231104_en.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_20231104_en.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_20231104_en.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_20231104_en.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202401260846_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202401260846_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202401260846_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202401260846_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202401281713_zh-CN.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202401281713_zh-CN.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202401281713_zh-CN.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202401281713_zh-CN.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202401292237_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202401292237_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202401292237_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202401292237_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gemini_gemini-pro_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gemini_gemini-pro_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gemini_gemini-pro_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gemini_gemini-pro_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gpt-3-5-turbo-0613_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gpt-3-5-turbo-0613_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gpt-3-5-turbo-0613_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gpt-3-5-turbo-0613_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gpt-3-5-turbo-1106_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gpt-3-5-turbo-1106_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gpt-3-5-turbo-1106_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gpt-3-5-turbo-1106_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gpt-4-0125-preview_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gpt-4-0125-preview_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gpt-4-0125-preview_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gpt-4-0125-preview_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gpt-4-0613_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gpt-4-0613_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gpt-4-0613_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gpt-4-0613_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gpt-4-1106-preview_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gpt-4-1106-preview_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202402011555_gpt-4-1106-preview_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_gpt-4-1106-preview_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202402011555_palm_chat-bison_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_palm_chat-bison_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202402011555_palm_chat-bison_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_palm_chat-bison_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202402011555_replicate_llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_replicate_llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202402011555_replicate_llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202402011555_replicate_llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202402012248_zh-CN.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202402012248_zh-CN.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202402012248_zh-CN.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202402012248_zh-CN.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202402212350_gpt-4-0125-preview_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202402212350_gpt-4-0125-preview_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202402212350_gpt-4-0125-preview_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202402212350_gpt-4-0125-preview_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202402271117_gemini_gemini-1-0-pro_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202402271117_gemini_gemini-1-0-pro_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202402271117_gemini_gemini-1-0-pro_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202402271117_gemini_gemini-1-0-pro_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202403061101_qwen-max-1201_zh-CN.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202403061101_qwen-max-1201_zh-CN.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202403061101_qwen-max-1201_zh-CN.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202403061101_qwen-max-1201_zh-CN.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202403291214_gpt-4-0125-preview_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202403291214_gpt-4-0125-preview_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202403291214_gpt-4-0125-preview_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202403291214_gpt-4-0125-preview_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202403291248_gemini_gemini-1-0-pro_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202403291248_gemini_gemini-1-0-pro_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202403291248_gemini_gemini-1-0-pro_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202403291248_gemini_gemini-1-0-pro_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202403291536_gemini_gemini-1-0-pro_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202403291536_gemini_gemini-1-0-pro_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202403291536_gemini_gemini-1-0-pro_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202403291536_gemini_gemini-1-0-pro_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202404011622_qwen-max-1201_zh-CN.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202404011622_qwen-max-1201_zh-CN.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202404011622_qwen-max-1201_zh-CN.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202404011622_qwen-max-1201_zh-CN.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202404051719_gpt-4-0125-preview_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202404051719_gpt-4-0125-preview_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202404051719_gpt-4-0125-preview_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202404051719_gpt-4-0125-preview_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202404102325_qwen-max-1201_zh-CN.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202404102325_qwen-max-1201_zh-CN.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202404102325_qwen-max-1201_zh-CN.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202404102325_qwen-max-1201_zh-CN.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202404201136_vertex_ai_gemini-1-5-pro_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202404201136_vertex_ai_gemini-1-5-pro_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202404201136_vertex_ai_gemini-1-5-pro_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202404201136_vertex_ai_gemini-1-5-pro_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202404201344_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202404201344_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202404201344_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202404201344_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202405012311_qwen-max-0403_zh-CN.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202405012311_qwen-max-0403_zh-CN.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202405012311_qwen-max-0403_zh-CN.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202405012311_qwen-max-0403_zh-CN.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202405162215_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202405162215_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202405162215_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202405162215_vertex_ai_gemini-1-5-pro-preview-0409_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202405162244_qwen-max-0403_zh-CN.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202405162244_qwen-max-0403_zh-CN.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202405162244_qwen-max-0403_zh-CN.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202405162244_qwen-max-0403_zh-CN.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202405162248_qwen-max-0403_zh-CN.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202405162248_qwen-max-0403_zh-CN.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202405162248_qwen-max-0403_zh-CN.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202405162248_qwen-max-0403_zh-CN.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202405242125_gpt-4o-2024-05-13_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202405242125_gpt-4o-2024-05-13_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202405242125_gpt-4o-2024-05-13_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202405242125_gpt-4o-2024-05-13_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202405281300_replicate_meta_meta-llama-3-70b-instruct_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202405281300_replicate_meta_meta-llama-3-70b-instruct_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202405281300_replicate_meta_meta-llama-3-70b-instruct_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202405281300_replicate_meta_meta-llama-3-70b-instruct_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202405291053_vertex_ai_claude-3-opus@20240229_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202405291053_vertex_ai_claude-3-opus@20240229_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202405291053_vertex_ai_claude-3-opus@20240229_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202405291053_vertex_ai_claude-3-opus@20240229_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202406040141_qwen-max-0428_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202406040141_qwen-max-0428_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202406040141_qwen-max-0428_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202406040141_qwen-max-0428_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202408291204_gpt-4o-2024-08-06_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202408291204_gpt-4o-2024-08-06_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202408291204_gpt-4o-2024-08-06_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202408291204_gpt-4o-2024-08-06_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202408310828_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202408310828_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202408310828_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202408310828_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202409102304_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202409102304_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202409102304_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202409102304_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202409211350_qwen-max-2024-09-19_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202409211350_qwen-max-2024-09-19_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202409211350_qwen-max-2024-09-19_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202409211350_qwen-max-2024-09-19_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202411221101_xai_grok-beta_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202411221101_xai_grok-beta_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202411221101_xai_grok-beta_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202411221101_xai_grok-beta_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202412052345_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202412052345_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202412052345_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202412052345_fireworks_ai_accounts_fireworks_models_llama-v3p1-405b-instruct_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202412052345_gpt-4o-2024-08-06_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202412052345_gpt-4o-2024-08-06_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202412052345_gpt-4o-2024-08-06_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202412052345_gpt-4o-2024-08-06_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202412052345_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202412052345_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202412052345_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202412052345_vertex_ai_claude-3-5-sonnet@20240620_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202412060713_qwen-max-2024-09-19_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202412060713_qwen-max-2024-09-19_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202412060713_qwen-max-2024-09-19_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202412060713_qwen-max-2024-09-19_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202412061047_vertex_ai_gemini-1-5-pro-002_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202412061047_vertex_ai_gemini-1-5-pro-002_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202412061047_vertex_ai_gemini-1-5-pro-002_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202412061047_vertex_ai_gemini-1-5-pro-002_en-US.yaml
diff --git a/automation-api/yival_experiments/experiment_configurations/experiment_202412061914_xai_grok-beta_en-US.yaml b/experiments/yival_experiment_archives/experiment_configurations/experiment_202412061914_xai_grok-beta_en-US.yaml
similarity index 100%
rename from automation-api/yival_experiments/experiment_configurations/experiment_202412061914_xai_grok-beta_en-US.yaml
rename to experiments/yival_experiment_archives/experiment_configurations/experiment_202412061914_xai_grok-beta_en-US.yaml