From efc88be3fd7bc88fbec020df5eed9a4a01e7ff25 Mon Sep 17 00:00:00 2001 From: Chi Wang <wang.chi@microsoft.com> Date: Fri, 12 May 2023 17:27:20 +0000 Subject: [PATCH 01/10] test example draft --- test/autogen/test_adv_gen.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 test/autogen/test_adv_gen.py diff --git a/test/autogen/test_adv_gen.py b/test/autogen/test_adv_gen.py new file mode 100644 index 0000000000..a46e2bdd2c --- /dev/null +++ b/test/autogen/test_adv_gen.py @@ -0,0 +1,30 @@ +from flaml import oai + + +def test_adv_gen(): + try: + import openai + except ImportError: + return + + # input_examples = [ + # {}, {} + # ] + + # adv_examples, metric_change = oai.Completion.generate_adversarial_examples(data=input_example, metric, mode, eval_func, num_examples) + + # adv_examples are like ... + + # metric is changed from ... to ... + + +if __name__ == "__main__": + import openai + + openai.api_key_path = "test/openai/key.txt" + # if you use Azure OpenAI, comment the above line and uncomment the following lines + # openai.api_type = "azure" + # openai.api_base = "https://<your_endpoint>.openai.azure.com/" + # openai.api_version = "2023-03-15-preview" # change if necessary + # openai.api_key = "<your_api_key>" + test_adv_gen() From b9f8dd6c865377a403ff42fe2f6da90d1be9d79c Mon Sep 17 00:00:00 2001 From: Srinagesh Sharma <srsharm@microsoft.com> Date: Wed, 24 May 2023 10:56:35 -0400 Subject: [PATCH 02/10] Adding adversarial example api backend with LLM example --- flaml/autogen/oai/completion.py | 30 +++++++++++++++++++ test/autogen/test_adv_gen.py | 53 ++++++++++++++++++++++++++++++--- 2 files changed, 79 insertions(+), 4 deletions(-) diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py index 6ec2bd7bd5..4f9d7c9028 100644 --- a/flaml/autogen/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -7,6 +7,7 @@ import json from flaml import tune, BlendSearch from flaml.automl.logger import logger_formatter +import regex as re try: import openai @@ -1037,6 +1038,35 @@ def stop_logging(cls): """End book keeping.""" cls._history_dict = cls._count_create = None + def generate_adversarial_examples( + self, data, verif_func, eval_func, num_examples=5, reduction='mean' + ): + base_prompt = 'Generate more complex versions of the input following examples. Make sure that the testing the input would result in the same target as specified. Make sure that the inputs are of the same types that are specified in the examples. 
Do not replace integers with words.\nexamples:{examples}' + + base_settings = {"max_tokens": 64, "temperature": 1, "top_p": 1, "n": 5, + "stream": False, "logprobs": None, 'engine': 'gpt-4'} + max_iter = 10 + iter = 0 + adv_examples = [] + while len(adv_examples) < num_examples and iter < max_iter: + query = base_settings + query['prompt'] = base_prompt.format(examples=str(data)) + resp = self.create(**query)['choices'][0]['text'] + adv_candidates = re.findall(r"(?={).*(?<=})", resp) + for cand in adv_candidates: + candidate = eval(cand) + cand_verification = verif_func(candidate) + cand_test = eval_func(candidate) + if cand_verification and not cand_test: + adv_examples.append(candidate) + + input_data_metric = reduction(eval_func(data)) + adv_metric = reduction(eval_func(adv_examples)) + + return adv_examples, (input_data_metric - adv_metric) + + + class ChatCompletion(Completion): """A class for OpenAI API ChatCompletion.""" diff --git a/test/autogen/test_adv_gen.py b/test/autogen/test_adv_gen.py index a46e2bdd2c..17070b204e 100644 --- a/test/autogen/test_adv_gen.py +++ b/test/autogen/test_adv_gen.py @@ -1,4 +1,5 @@ from flaml import oai +import re def test_adv_gen(): @@ -7,11 +8,55 @@ def test_adv_gen(): except ImportError: return - # input_examples = [ - # {}, {} - # ] + input_examples = [{"input": "1 + 4 =", "target": "5"}, + {"input": "4 + 9 =", "target": "13"}, + {"input": "5 + 0 =", "target": "5"}, + {"input": "6 + 4 =", "target": "10"}, + {"input": "8 + 3 =", "target": "11"}, + {"input": "5 + 6 =", "target": "11"}, + {"input": "5 + 2 =", "target": "7"}, + {"input": "8 + 1 =", "target": "9"}, + {"input": "0 + 4 =", "target": "4"}, + {"input": "3 + 3 =", "target": "6"}, + {"input": "47 + 8 =", "target": "55"}, + {"input": "30 + 89 =", "target": "119"}, + {"input": "30 + 58 =", "target": "88"}, + {"input": "31 + 46 =", "target": "77"}, + {"input": "16 + 72 =", "target": "88"}, + {"input": "15 + 63 =", "target": "78"}, + {"input": "34 + 31 =", "target": "65"}, + {"input": "34 + 17 =", "target": "51"}, + {"input": "9 + 74 =", "target": "83"}, + {"input": "86 + 33 =", "target": "119"}, + {"input": "486 + 141 =", "target": "627"}, + {"input": "76 + 812 =", "target": "888"}, + {"input": "13 + 476 =", "target": "489"}, + {"input": "94 + 129 =", "target": "223"}, + {"input": "635 + 613 =", "target": "1248"}, + {"input": "516 + 962 =", "target": "1478"}, + {"input": "749 + 750 =", "target": "1499"}, + {"input": "389 + 442 =", "target": "831"}, + {"input": "773 + 546 =", "target": "1319"}, + {"input": "348 + 227 =", "target": "575"}] - # adv_examples, metric_change = oai.Completion.generate_adversarial_examples(data=input_example, metric, mode, eval_func, num_examples) + def verif_arith(example): + lhs = eval(re.findall(r"^(.*?)=", example['input'])[0].strip()) + rhs = int(example['target']) + + return lhs == rhs + + def test_arith(example): + base_prompt='{input}' + query = {"max_tokens": 64, "temperature": 0, "top_p": 1, "n": 1, "stream": False, "logprobs": None, 'engine': 'text-davinci-003', 'stop': '\n'} + query['prompt'] = base_prompt.format(example['input']) + resp = oai.Completion.create(**query) + + return example['target'] == resp['choices'][0]['text'] + + + adv_examples, metric_change = oai.Completion.generate_adversarial_examples(data=input_examples, verif_func=verif_arith, eval_func=test_arith, num_examples=5, reduction='mean') + + # adv_examples, metric_change = oai.Completion.generate_adversarial_examples(data=input_examples, mode, eval_func, num_examples) # adv_examples are 
like ... From fa7fa93b18c6a44f3fbe500bdf330d8e839a92c3 Mon Sep 17 00:00:00 2001 From: Chi Wang <wang.chi@microsoft.com> Date: Fri, 26 May 2023 16:11:05 +0000 Subject: [PATCH 03/10] regex not working --- flaml/autogen/datagen.py | 35 +++++++++++ flaml/autogen/oai/completion.py | 33 ---------- test/autogen/test_adv_gen.py | 106 ++++++++++++++++++-------------- 3 files changed, 95 insertions(+), 79 deletions(-) create mode 100644 flaml/autogen/datagen.py diff --git a/flaml/autogen/datagen.py b/flaml/autogen/datagen.py new file mode 100644 index 0000000000..0830baa3a4 --- /dev/null +++ b/flaml/autogen/datagen.py @@ -0,0 +1,35 @@ +import regex as re +import numpy as np +from flaml import oai + + +def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, reduction=np.mean, **config): + base_prompt = "Generate more complex versions of the input following examples. Make sure that the testing the input would result in the same target as specified. Make sure that the inputs are of the same types that are specified in the examples. Do not replace integers with words.\nexamples:{examples}" + + # base_settings = { + # "max_tokens": 64, + # "temperature": 1, + # "top_p": 1, + # "n": 5, + # "model": "gpt-4", + # } + max_iter = 10 + iter = 0 + adv_examples = [] + while len(adv_examples) < num_examples and iter < max_iter: + # query = base_settings + # query["prompt"] = base_prompt.format(examples=str(data)) + response = oai.Completion.create({"examples": str(data)}, prompt=base_prompt, **config) + resp = oai.Completion.extract_text(response)[0] + adv_candidates = re.findall(r"(?={).*(?<=})", resp) + for cand in adv_candidates: + candidate = eval(cand) + cand_verification = verif_func(candidate) + cand_test = eval_func(candidate, **config) + if cand_verification and not cand_test: + adv_examples.append(candidate) + + input_data_metric = reduction(eval_func(data, **config)) + adv_metric = reduction(eval_func(adv_examples, **config)) + + return adv_examples, (input_data_metric - adv_metric) diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py index d48b767ef2..f98ccd8a8a 100644 --- a/flaml/autogen/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -5,7 +5,6 @@ from typing import List, Optional, Dict, Callable, Any import sys import shutil -import regex as re from flaml import tune, BlendSearch from flaml.tune.space import is_constant from flaml.automl.logger import logger_formatter @@ -1054,38 +1053,6 @@ def stop_logging(cls): """End book keeping.""" cls._history_dict = cls._count_create = None - def generate_adversarial_examples(self, data, verif_func, eval_func, num_examples=5, reduction="mean"): - base_prompt = "Generate more complex versions of the input following examples. Make sure that the testing the input would result in the same target as specified. Make sure that the inputs are of the same types that are specified in the examples. 
Do not replace integers with words.\nexamples:{examples}" - - base_settings = { - "max_tokens": 64, - "temperature": 1, - "top_p": 1, - "n": 5, - "stream": False, - "logprobs": None, - "engine": "gpt-4", - } - max_iter = 10 - iter = 0 - adv_examples = [] - while len(adv_examples) < num_examples and iter < max_iter: - query = base_settings - query["prompt"] = base_prompt.format(examples=str(data)) - resp = self.create(**query)["choices"][0]["text"] - adv_candidates = re.findall(r"(?={).*(?<=})", resp) - for cand in adv_candidates: - candidate = eval(cand) - cand_verification = verif_func(candidate) - cand_test = eval_func(candidate) - if cand_verification and not cand_test: - adv_examples.append(candidate) - - input_data_metric = reduction(eval_func(data)) - adv_metric = reduction(eval_func(adv_examples)) - - return adv_examples, (input_data_metric - adv_metric) - class ChatCompletion(Completion): """A class for OpenAI API ChatCompletion.""" diff --git a/test/autogen/test_adv_gen.py b/test/autogen/test_adv_gen.py index 17070b204e..519ed4415a 100644 --- a/test/autogen/test_adv_gen.py +++ b/test/autogen/test_adv_gen.py @@ -1,6 +1,9 @@ from flaml import oai +from flaml.autogen.datagen import generate_adversarial_examples import re +KEY_LOC = "test/autogen" + def test_adv_gen(): try: @@ -8,53 +11,66 @@ def test_adv_gen(): except ImportError: return - input_examples = [{"input": "1 + 4 =", "target": "5"}, - {"input": "4 + 9 =", "target": "13"}, - {"input": "5 + 0 =", "target": "5"}, - {"input": "6 + 4 =", "target": "10"}, - {"input": "8 + 3 =", "target": "11"}, - {"input": "5 + 6 =", "target": "11"}, - {"input": "5 + 2 =", "target": "7"}, - {"input": "8 + 1 =", "target": "9"}, - {"input": "0 + 4 =", "target": "4"}, - {"input": "3 + 3 =", "target": "6"}, - {"input": "47 + 8 =", "target": "55"}, - {"input": "30 + 89 =", "target": "119"}, - {"input": "30 + 58 =", "target": "88"}, - {"input": "31 + 46 =", "target": "77"}, - {"input": "16 + 72 =", "target": "88"}, - {"input": "15 + 63 =", "target": "78"}, - {"input": "34 + 31 =", "target": "65"}, - {"input": "34 + 17 =", "target": "51"}, - {"input": "9 + 74 =", "target": "83"}, - {"input": "86 + 33 =", "target": "119"}, - {"input": "486 + 141 =", "target": "627"}, - {"input": "76 + 812 =", "target": "888"}, - {"input": "13 + 476 =", "target": "489"}, - {"input": "94 + 129 =", "target": "223"}, - {"input": "635 + 613 =", "target": "1248"}, - {"input": "516 + 962 =", "target": "1478"}, - {"input": "749 + 750 =", "target": "1499"}, - {"input": "389 + 442 =", "target": "831"}, - {"input": "773 + 546 =", "target": "1319"}, - {"input": "348 + 227 =", "target": "575"}] + input_examples = [ + {"input": "1 + 4 =", "target": "5"}, + {"input": "4 + 9 =", "target": "13"}, + {"input": "5 + 0 =", "target": "5"}, + {"input": "6 + 4 =", "target": "10"}, + {"input": "8 + 3 =", "target": "11"}, + {"input": "5 + 6 =", "target": "11"}, + {"input": "5 + 2 =", "target": "7"}, + {"input": "8 + 1 =", "target": "9"}, + {"input": "0 + 4 =", "target": "4"}, + {"input": "3 + 3 =", "target": "6"}, + {"input": "47 + 8 =", "target": "55"}, + {"input": "30 + 89 =", "target": "119"}, + {"input": "30 + 58 =", "target": "88"}, + {"input": "31 + 46 =", "target": "77"}, + {"input": "16 + 72 =", "target": "88"}, + {"input": "15 + 63 =", "target": "78"}, + {"input": "34 + 31 =", "target": "65"}, + {"input": "34 + 17 =", "target": "51"}, + {"input": "9 + 74 =", "target": "83"}, + {"input": "86 + 33 =", "target": "119"}, + {"input": "486 + 141 =", "target": "627"}, + {"input": "76 + 812 =", 
"target": "888"}, + {"input": "13 + 476 =", "target": "489"}, + {"input": "94 + 129 =", "target": "223"}, + {"input": "635 + 613 =", "target": "1248"}, + {"input": "516 + 962 =", "target": "1478"}, + {"input": "749 + 750 =", "target": "1499"}, + {"input": "389 + 442 =", "target": "831"}, + {"input": "773 + 546 =", "target": "1319"}, + {"input": "348 + 227 =", "target": "575"}, + ] def verif_arith(example): - lhs = eval(re.findall(r"^(.*?)=", example['input'])[0].strip()) - rhs = int(example['target']) + lhs = eval(re.findall(r"^(.*?)=", example["input"])[0].strip()) + rhs = int(example["target"]) return lhs == rhs - def test_arith(example): - base_prompt='{input}' - query = {"max_tokens": 64, "temperature": 0, "top_p": 1, "n": 1, "stream": False, "logprobs": None, 'engine': 'text-davinci-003', 'stop': '\n'} - query['prompt'] = base_prompt.format(example['input']) - resp = oai.Completion.create(**query) + def test_arith(example, **config): + base_prompt = "{input}" + # query = {"max_tokens": 64, "temperature": 0, "top_p": 1, "n": 1, "stream": False, "logprobs": None, 'engine': 'text-davinci-003', 'stop': '\n'} + # query['prompt'] = base_prompt.format(example['input']) + # resp = oai.Completion.create(**query) + response = oai.Completion.create(example, prompt=base_prompt, **config) - return example['target'] == resp['choices'][0]['text'] + return example["target"] == oai.Completion.extract_text(response)[0] - - adv_examples, metric_change = oai.Completion.generate_adversarial_examples(data=input_examples, verif_func=verif_arith, eval_func=test_arith, num_examples=5, reduction='mean') + config_list = oai.config_list_openai_aoai(KEY_LOC) + adv_examples, metric_change = generate_adversarial_examples( + data=input_examples, + verif_func=verif_arith, + eval_func=test_arith, + num_examples=5, + # reduction=np.mean, + config_list=config_list, + model="gpt-3.5-turbo", + ) + print(adv_examples) + print(metric_change) # adv_examples, metric_change = oai.Completion.generate_adversarial_examples(data=input_examples, mode, eval_func, num_examples) @@ -65,11 +81,9 @@ def test_arith(example): if __name__ == "__main__": import openai + import os - openai.api_key_path = "test/openai/key.txt" - # if you use Azure OpenAI, comment the above line and uncomment the following lines - # openai.api_type = "azure" - # openai.api_base = "https://<your_endpoint>.openai.azure.com/" - # openai.api_version = "2023-03-15-preview" # change if necessary - # openai.api_key = "<your_api_key>" + config_list = oai.config_list_openai_aoai(KEY_LOC) + assert len(config_list) >= 3, config_list + openai.api_key = os.environ["OPENAI_API_KEY"] test_adv_gen() From d49a8108e61d5e8e8cc66ce79f9fc02c19eb5d86 Mon Sep 17 00:00:00 2001 From: Chi Wang <wang.chi@microsoft.com> Date: Fri, 26 May 2023 16:31:50 +0000 Subject: [PATCH 04/10] debug parsing --- flaml/autogen/datagen.py | 8 ++++---- test/autogen/test_adv_gen.py | 22 +++++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/flaml/autogen/datagen.py b/flaml/autogen/datagen.py index 0830baa3a4..d2801991a0 100644 --- a/flaml/autogen/datagen.py +++ b/flaml/autogen/datagen.py @@ -21,15 +21,15 @@ def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, r # query["prompt"] = base_prompt.format(examples=str(data)) response = oai.Completion.create({"examples": str(data)}, prompt=base_prompt, **config) resp = oai.Completion.extract_text(response)[0] - adv_candidates = re.findall(r"(?={).*(?<=})", resp) + adv_candidates = eval(resp.strip()) # 
re.findall(r"(?={).*(?<=})", resp) for cand in adv_candidates: candidate = eval(cand) cand_verification = verif_func(candidate) - cand_test = eval_func(candidate, **config) + cand_test = eval_func(candidate) if cand_verification and not cand_test: adv_examples.append(candidate) - input_data_metric = reduction(eval_func(data, **config)) - adv_metric = reduction(eval_func(adv_examples, **config)) + input_data_metric = reduction(eval_func(data)) + adv_metric = reduction(eval_func(adv_examples)) return adv_examples, (input_data_metric - adv_metric) diff --git a/test/autogen/test_adv_gen.py b/test/autogen/test_adv_gen.py index 519ed4415a..44fb1d1ba8 100644 --- a/test/autogen/test_adv_gen.py +++ b/test/autogen/test_adv_gen.py @@ -50,9 +50,17 @@ def verif_arith(example): return lhs == rhs - def test_arith(example, **config): + def test_arith(example): base_prompt = "{input}" - # query = {"max_tokens": 64, "temperature": 0, "top_p": 1, "n": 1, "stream": False, "logprobs": None, 'engine': 'text-davinci-003', 'stop': '\n'} + config = { + "max_tokens": 64, + "temperature": 0, + "top_p": 1, + "n": 1, + "stream": False, + "model": "text-davinci-003", + "stop": "\n", + } # query['prompt'] = base_prompt.format(example['input']) # resp = oai.Completion.create(**query) response = oai.Completion.create(example, prompt=base_prompt, **config) @@ -80,10 +88,10 @@ def test_arith(example, **config): if __name__ == "__main__": - import openai - import os + # import openai + # import os - config_list = oai.config_list_openai_aoai(KEY_LOC) - assert len(config_list) >= 3, config_list - openai.api_key = os.environ["OPENAI_API_KEY"] + # config_list = oai.config_list_openai_aoai(KEY_LOC) + # assert len(config_list) >= 3, config_list + # openai.api_key = os.environ["OPENAI_API_KEY"] test_adv_gen() From b765c92daf51ce915364f4fc7354863c460c19bd Mon Sep 17 00:00:00 2001 From: Chi Wang <wang.chi@microsoft.com> Date: Sat, 27 May 2023 00:25:10 +0000 Subject: [PATCH 05/10] functioning --- flaml/autogen/datagen.py | 12 +++++------- test/autogen/test_adv_gen.py | 14 ++++++-------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/flaml/autogen/datagen.py b/flaml/autogen/datagen.py index d2801991a0..98b9892eb3 100644 --- a/flaml/autogen/datagen.py +++ b/flaml/autogen/datagen.py @@ -1,10 +1,11 @@ +import json import regex as re import numpy as np from flaml import oai def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, reduction=np.mean, **config): - base_prompt = "Generate more complex versions of the input following examples. Make sure that the testing the input would result in the same target as specified. Make sure that the inputs are of the same types that are specified in the examples. Do not replace integers with words.\nexamples:{examples}" + base_prompt = "Generate more complex versions of the following input examples. Make sure that the input would result in the same target as specified. Make sure that the inputs are of the same types that are specified in the examples. Generate parsable json with double quotes. 
Do not replace integers with words.\nexamples:{examples}" # base_settings = { # "max_tokens": 64, @@ -21,15 +22,12 @@ def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, r # query["prompt"] = base_prompt.format(examples=str(data)) response = oai.Completion.create({"examples": str(data)}, prompt=base_prompt, **config) resp = oai.Completion.extract_text(response)[0] - adv_candidates = eval(resp.strip()) # re.findall(r"(?={).*(?<=})", resp) + adv_candidates = json.loads(resp.strip().replace("'", '"')) # re.findall(r"(?={).*(?<=})", resp) for cand in adv_candidates: - candidate = eval(cand) + candidate = cand cand_verification = verif_func(candidate) cand_test = eval_func(candidate) if cand_verification and not cand_test: adv_examples.append(candidate) - input_data_metric = reduction(eval_func(data)) - adv_metric = reduction(eval_func(adv_examples)) - - return adv_examples, (input_data_metric - adv_metric) + return adv_examples diff --git a/test/autogen/test_adv_gen.py b/test/autogen/test_adv_gen.py index 44fb1d1ba8..01a77ea2d5 100644 --- a/test/autogen/test_adv_gen.py +++ b/test/autogen/test_adv_gen.py @@ -63,22 +63,20 @@ def test_arith(example): } # query['prompt'] = base_prompt.format(example['input']) # resp = oai.Completion.create(**query) - response = oai.Completion.create(example, prompt=base_prompt, **config) + response = oai.Completion.create(example, prompt=base_prompt, config_list=config_list_eval, **config) + return example["target"] == oai.Completion.extract_text(response)[0].strip() - return example["target"] == oai.Completion.extract_text(response)[0] - - config_list = oai.config_list_openai_aoai(KEY_LOC) - adv_examples, metric_change = generate_adversarial_examples( + config_list_adv = oai.config_list_gpt4_gpt35(KEY_LOC) + config_list_eval = oai.config_list_openai_aoai(KEY_LOC) + adv_examples = generate_adversarial_examples( data=input_examples, verif_func=verif_arith, eval_func=test_arith, num_examples=5, # reduction=np.mean, - config_list=config_list, - model="gpt-3.5-turbo", + config_list=config_list_adv, ) print(adv_examples) - print(metric_change) # adv_examples, metric_change = oai.Completion.generate_adversarial_examples(data=input_examples, mode, eval_func, num_examples) From 52b13ec1f024ee7388c828d72478001ccd24f431 Mon Sep 17 00:00:00 2001 From: Chi Wang <wang.chi@microsoft.com> Date: Sat, 27 May 2023 00:26:59 +0000 Subject: [PATCH 06/10] functioning --- flaml/autogen/datagen.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/flaml/autogen/datagen.py b/flaml/autogen/datagen.py index 98b9892eb3..916c326e83 100644 --- a/flaml/autogen/datagen.py +++ b/flaml/autogen/datagen.py @@ -1,10 +1,8 @@ import json -import regex as re -import numpy as np from flaml import oai -def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, reduction=np.mean, **config): +def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, **config): base_prompt = "Generate more complex versions of the following input examples. Make sure that the input would result in the same target as specified. Make sure that the inputs are of the same types that are specified in the examples. Generate parsable json with double quotes. 
Do not replace integers with words.\nexamples:{examples}" # base_settings = { @@ -15,9 +13,9 @@ def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, r # "model": "gpt-4", # } max_iter = 10 - iter = 0 + iteration = 0 adv_examples = [] - while len(adv_examples) < num_examples and iter < max_iter: + while len(adv_examples) < num_examples and iteration < max_iter: # query = base_settings # query["prompt"] = base_prompt.format(examples=str(data)) response = oai.Completion.create({"examples": str(data)}, prompt=base_prompt, **config) @@ -29,5 +27,5 @@ def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, r cand_test = eval_func(candidate) if cand_verification and not cand_test: adv_examples.append(candidate) - + iteration += 1 return adv_examples From 1ec0f7e105462af32001259cacacf74dbda5c666 Mon Sep 17 00:00:00 2001 From: Srinagesh Sharma <srsharm@microsoft.com> Date: Thu, 1 Jun 2023 19:08:22 -0400 Subject: [PATCH 07/10] Adding Hydra to AdvGen, WikiQA test case --- flaml/autogen/datagen.py | 63 +++++++++-- test/autogen/configs/config.yaml | 14 +++ test/autogen/test_adv_gen.py | 179 +++++++++++++++++++++++-------- 3 files changed, 203 insertions(+), 53 deletions(-) create mode 100644 test/autogen/configs/config.yaml diff --git a/flaml/autogen/datagen.py b/flaml/autogen/datagen.py index 916c326e83..145c91db1a 100644 --- a/flaml/autogen/datagen.py +++ b/flaml/autogen/datagen.py @@ -1,9 +1,25 @@ import json from flaml import oai +import regex as re +from itertools import compress +import time +import logging +logger = logging.getLogger(__name__) def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, **config): - base_prompt = "Generate more complex versions of the following input examples. Make sure that the input would result in the same target as specified. Make sure that the inputs are of the same types that are specified in the examples. Generate parsable json with double quotes. Do not replace integers with words.\nexamples:{examples}" + base_prompt = """ + # Instructions + - Generate adversarial versions of the examples in the following task. + - Make sure that the input would result in the same target as specified. + - Make sure that the inputs are of the same types that are specified in the examples. + - Generate parsable json with double quotes. + - Do not replace integers with words. 
+ <|start|>(example) + {example} + <|end|> + <|start|>(answer) + """ # base_settings = { # "max_tokens": 64, @@ -15,17 +31,42 @@ def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, * max_iter = 10 iteration = 0 adv_examples = [] + + def group_check(candidate): # replace with loss function + verif = verif_func(candidate) + cand_test = eval_func(candidate) + return verif and not cand_test + while len(adv_examples) < num_examples and iteration < max_iter: # query = base_settings # query["prompt"] = base_prompt.format(examples=str(data)) - response = oai.Completion.create({"examples": str(data)}, prompt=base_prompt, **config) - resp = oai.Completion.extract_text(response)[0] - adv_candidates = json.loads(resp.strip().replace("'", '"')) # re.findall(r"(?={).*(?<=})", resp) - for cand in adv_candidates: - candidate = cand - cand_verification = verif_func(candidate) - cand_test = eval_func(candidate) - if cand_verification and not cand_test: - adv_examples.append(candidate) - iteration += 1 + # time.sleep(62) + response = oai.Completion.create({"example": str(data)}, prompt=base_prompt, **config) + resp_candidates = re.findall(r"(?={).*(?<=})", oai.Completion.extract_text(response)[0]) + adv_candidates = list(map(eval, resp_candidates)) + eval_candidates = list(map(group_check, adv_candidates)) + valid_candidates = list(compress(adv_candidates, eval_candidates)) + if len(valid_candidates) > 0: + adv_examples.append(valid_candidates) + iteration = 0 + else: + iteration += 1 + return adv_examples + + +# base_prompt = """ + # <|meta_start|> + # # Introduction + # - You are an adversarial example generation assistant + # - Your goal is to generate more complex versions of the examples in the following task. + # - Make sure that the input would result in the same target as specified. + # - Make sure that the inputs are of the same types that are specified in the examples. + # - Generate parsable json with double quotes. + # - Do not replace integers with words. 
+ # <|meta_end|> + # <|start|>(example) + # {examples} + # <|end|> + # <|start|>(answer) + # """ \ No newline at end of file diff --git a/test/autogen/configs/config.yaml b/test/autogen/configs/config.yaml new file mode 100644 index 0000000000..62b687f8fc --- /dev/null +++ b/test/autogen/configs/config.yaml @@ -0,0 +1,14 @@ +hydra: + job: + chdir: false + +openai: + key_path: <key-path> + adv: + model: <adversarial-gen-model> + # api_base: + # Other override arguments for adv + eval: + model: 'text-davinci-003' + # api_base: + # other override args \ No newline at end of file diff --git a/test/autogen/test_adv_gen.py b/test/autogen/test_adv_gen.py index 01a77ea2d5..4212dc9dbe 100644 --- a/test/autogen/test_adv_gen.py +++ b/test/autogen/test_adv_gen.py @@ -1,56 +1,64 @@ from flaml import oai from flaml.autogen.datagen import generate_adversarial_examples import re +import logging +import hydra +import wikipedia KEY_LOC = "test/autogen" +logger = logging.getLogger(__name__) - -def test_adv_gen(): +@hydra.main(config_path="configs", config_name="config") +def test_adv_gen(cfg): try: import openai except ImportError: return + # config_list_adv = oai.config_list_gpt4_gpt35(KEY_LOC) + config_list_adv = oai.config_list_openai_aoai(KEY_LOC) + config_list_adv[0].update(cfg.openai.adv) + config_list_eval = oai.config_list_openai_aoai(KEY_LOC) + config_list_eval[0].update(cfg.openai.eval) + + test_cases = [# SimpleArith(config_list=config_list_eval) + WikipediaQGen(config_list=config_list_eval) + ] + + for case in test_cases: + adv_examples = generate_adversarial_examples( + data=case.input_examples, + verif_func=case.verif_func, + eval_func=case.test_func, + num_examples=5, + # reduction=np.mean, + config_list=config_list_adv, + ) + print(adv_examples) + +class SimpleArith: input_examples = [ {"input": "1 + 4 =", "target": "5"}, {"input": "4 + 9 =", "target": "13"}, - {"input": "5 + 0 =", "target": "5"}, - {"input": "6 + 4 =", "target": "10"}, {"input": "8 + 3 =", "target": "11"}, - {"input": "5 + 6 =", "target": "11"}, - {"input": "5 + 2 =", "target": "7"}, - {"input": "8 + 1 =", "target": "9"}, - {"input": "0 + 4 =", "target": "4"}, - {"input": "3 + 3 =", "target": "6"}, - {"input": "47 + 8 =", "target": "55"}, {"input": "30 + 89 =", "target": "119"}, - {"input": "30 + 58 =", "target": "88"}, - {"input": "31 + 46 =", "target": "77"}, - {"input": "16 + 72 =", "target": "88"}, - {"input": "15 + 63 =", "target": "78"}, - {"input": "34 + 31 =", "target": "65"}, - {"input": "34 + 17 =", "target": "51"}, - {"input": "9 + 74 =", "target": "83"}, - {"input": "86 + 33 =", "target": "119"}, {"input": "486 + 141 =", "target": "627"}, - {"input": "76 + 812 =", "target": "888"}, {"input": "13 + 476 =", "target": "489"}, - {"input": "94 + 129 =", "target": "223"}, - {"input": "635 + 613 =", "target": "1248"}, - {"input": "516 + 962 =", "target": "1478"}, - {"input": "749 + 750 =", "target": "1499"}, - {"input": "389 + 442 =", "target": "831"}, {"input": "773 + 546 =", "target": "1319"}, {"input": "348 + 227 =", "target": "575"}, ] - def verif_arith(example): + def __init__(self, config_list): + self.config_list = config_list + + @staticmethod + def verif_func(example): lhs = eval(re.findall(r"^(.*?)=", example["input"])[0].strip()) rhs = int(example["target"]) return lhs == rhs - def test_arith(example): + def test_func(self, example): base_prompt = "{input}" config = { "max_tokens": 64, @@ -59,30 +67,117 @@ def test_arith(example): "n": 1, "stream": False, "model": "text-davinci-003", - "stop": "\n", } # 
query['prompt'] = base_prompt.format(example['input']) # resp = oai.Completion.create(**query) - response = oai.Completion.create(example, prompt=base_prompt, config_list=config_list_eval, **config) + response = oai.Completion.create(example, prompt=base_prompt, config_list=self.config_list, **config) return example["target"] == oai.Completion.extract_text(response)[0].strip() - config_list_adv = oai.config_list_gpt4_gpt35(KEY_LOC) - config_list_eval = oai.config_list_openai_aoai(KEY_LOC) - adv_examples = generate_adversarial_examples( - data=input_examples, - verif_func=verif_arith, - eval_func=test_arith, - num_examples=5, - # reduction=np.mean, - config_list=config_list_adv, - ) - print(adv_examples) - # adv_examples, metric_change = oai.Completion.generate_adversarial_examples(data=input_examples, mode, eval_func, num_examples) +class WikipediaQGen: + def __init__(self, config_list, search_term='Cornell University'): + self.config_list = config_list + r = wikipedia.search(search_term) + page = wikipedia.page(r[0]) + self.title = page.title + self.content = page.content + example_gen_prompt = f"""<|im_start|>system +You are a question generating assistant. Your objective is to take some context and generate questions together with their corresponding answer or possible answers +<|im_end|> +<|im_start|>user +Context +--- +# +{page.title} + +{page.content} +<|im_end|> +<|im_start|>user +Generate a series of questions related to {page.title} as follows. + +1. Mode = "paragraph" + +Write a question for which the answer is a short paragraph. + +2. Mode = "few-words" - # adv_examples are like ... +The answer is at most a few words. + +3. Mode = "number" + +The answer is a number. + +4. Mode = "bool" + +Generate a question with a True/False answer. + +For each question above, provide the corresponding correct answer. If there is more than one correct answer, provide a list of all possible answers. +<|im_end|> +<|im_start|>assistant +""" + config = { + "max_tokens": 512, + "temperature": 0.7, + "top_p": 1, + "n": 1, + "stream": False, + "model": "text-davinci-003", + } + + response = oai.Completion.create(prompt=example_gen_prompt, config_list=self.config_list, **config) + answer = oai.Completion.extract_text(response)[0].strip() + # find qa + qa_parsed = re.findall(r"(?=Question:)[\s\S]*?(?=[0-9]. Mode|$)", response) + self.input_examples = [] + for qa in qa_parsed: + example = {"input":re.findall(r"(?<=Question:)[\s\S]*?(?=Answer:)", qa)[0].strip(), + "target":re.findall(r"(?<=Answer:)", qa)[0].strip()} + self.input_examples.append(example) + + + def add_message(self, content, role="user"): + self.messages.append({"role": role, "content": content}) + + def verif_func(self, example): + base_prompt = """Respond with Yes or No, does the text below answer the question provided? + Question: {input} + Text: {target} + Answer: + """ + config = { + "max_tokens": 512, + "temperature": 0, + "top_p": 1, + "n": 1, + "stream": False, + "model": "text-davinci-003", + } + response = oai.Completion.create(example, prompt=base_prompt, config_list=self.config_list, **config) + answer = oai.Completion.extract_text(response)[0].strip() + return answer == 'Yes' + + def test_func(self, example): + base_prompt = f"""Answer the following question based on the context provided. 
+ Question: + {{input}} + Context: + {self.title} + {self.content} + Answer: + """ + config = { + "max_tokens": 512, + "temperature": 0, + "top_p": 1, + "n": 1, + "stream": False, + "model": "text-davinci-003", + } + response = oai.Completion.create(example, prompt=base_prompt, config_list=self.config_list, **config) + answer = oai.Completion.extract_text(response)[0] + pred_example = {"input": example["input"], "target": answer} + return self.verif_func(pred_example) - # metric is changed from ... to ... if __name__ == "__main__": From 336be46197e9bd67deaa50fa1f8c5fec53e69c47 Mon Sep 17 00:00:00 2001 From: Chi Wang <wang.chi@microsoft.com> Date: Fri, 2 Jun 2023 16:21:15 +0000 Subject: [PATCH 08/10] fix some bug --- test/autogen/test_adv_gen.py | 66 +++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/test/autogen/test_adv_gen.py b/test/autogen/test_adv_gen.py index 4212dc9dbe..dc1ab5b2bb 100644 --- a/test/autogen/test_adv_gen.py +++ b/test/autogen/test_adv_gen.py @@ -8,6 +8,7 @@ KEY_LOC = "test/autogen" logger = logging.getLogger(__name__) + @hydra.main(config_path="configs", config_name="config") def test_adv_gen(cfg): try: @@ -16,14 +17,19 @@ def test_adv_gen(cfg): return # config_list_adv = oai.config_list_gpt4_gpt35(KEY_LOC) - config_list_adv = oai.config_list_openai_aoai(KEY_LOC) - config_list_adv[0].update(cfg.openai.adv) + config_list_adv = oai.config_list_openai_aoai(KEY_LOC)[1:] + # config_list_adv[0].update(cfg.openai.adv) config_list_eval = oai.config_list_openai_aoai(KEY_LOC) - config_list_eval[0].update(cfg.openai.eval) - - test_cases = [# SimpleArith(config_list=config_list_eval) - WikipediaQGen(config_list=config_list_eval) - ] + # config_list_eval[0].update(cfg.openai.eval) + + test_cases = [ # SimpleArith(config_list=config_list_eval) + WikipediaQGen( + config_list_adv=config_list_adv, + config_adv=cfg.openai.adv, + config_list_eval=config_list_eval, + config_eval=cfg.openai.eval, + ) + ] for case in test_cases: adv_examples = generate_adversarial_examples( @@ -33,9 +39,11 @@ def test_adv_gen(cfg): num_examples=5, # reduction=np.mean, config_list=config_list_adv, + **cfg.openai.adv, ) print(adv_examples) + class SimpleArith: input_examples = [ {"input": "1 + 4 =", "target": "5"}, @@ -75,8 +83,13 @@ def test_func(self, example): class WikipediaQGen: - def __init__(self, config_list, search_term='Cornell University'): - self.config_list = config_list + def __init__( + self, config_list_adv={}, search_term="Cornell University", config_eval={}, config_adv={}, config_list_eval={} + ): + self.config_list_adv = config_list_adv + self.config_list_eval = config_list_eval + self.config_eval = config_eval + self.config_adv = config_adv r = wikipedia.search(search_term) page = wikipedia.page(r[0]) self.title = page.title @@ -87,7 +100,7 @@ def __init__(self, config_list, search_term='Cornell University'): <|im_start|>user Context --- -# +# {page.title} {page.content} @@ -120,25 +133,25 @@ def __init__(self, config_list, search_term='Cornell University'): "temperature": 0.7, "top_p": 1, "n": 1, - "stream": False, - "model": "text-davinci-003", + "model": "gpt-4-32k", } - - response = oai.Completion.create(prompt=example_gen_prompt, config_list=self.config_list, **config) + response = oai.Completion.create(prompt=example_gen_prompt, config_list=self.config_list_adv, **config) answer = oai.Completion.extract_text(response)[0].strip() # find qa - qa_parsed = re.findall(r"(?=Question:)[\s\S]*?(?=[0-9]. 
Mode|$)", response) + qa_parsed = re.findall(r"(?=Question:)[\s\S]*?(?=[0-9]. Mode|$)", answer) self.input_examples = [] for qa in qa_parsed: - example = {"input":re.findall(r"(?<=Question:)[\s\S]*?(?=Answer:)", qa)[0].strip(), - "target":re.findall(r"(?<=Answer:)", qa)[0].strip()} + example = { + "input": re.findall(r"(?<=Question:)[\s\S]*?(?=Answer:)", qa)[0].strip(), + "target": re.findall(r"(?<=Answer:)", qa)[0].strip(), + } self.input_examples.append(example) - - def add_message(self, content, role="user"): - self.messages.append({"role": role, "content": content}) + # def add_message(self, content, role="user"): + # self.messages.append({"role": role, "content": content}) def verif_func(self, example): + print(example) base_prompt = """Respond with Yes or No, does the text below answer the question provided? Question: {input} Text: {target} @@ -149,12 +162,11 @@ def verif_func(self, example): "temperature": 0, "top_p": 1, "n": 1, - "stream": False, - "model": "text-davinci-003", + **self.config_adv, } - response = oai.Completion.create(example, prompt=base_prompt, config_list=self.config_list, **config) + response = oai.Completion.create(example, prompt=base_prompt, config_list=self.config_list_adv, **config) answer = oai.Completion.extract_text(response)[0].strip() - return answer == 'Yes' + return answer == "Yes" def test_func(self, example): base_prompt = f"""Answer the following question based on the context provided. @@ -170,16 +182,14 @@ def test_func(self, example): "temperature": 0, "top_p": 1, "n": 1, - "stream": False, - "model": "text-davinci-003", + **self.config_eval, } - response = oai.Completion.create(example, prompt=base_prompt, config_list=self.config_list, **config) + response = oai.Completion.create(example, prompt=base_prompt, config_list=self.config_list_eval, **config) answer = oai.Completion.extract_text(response)[0] pred_example = {"input": example["input"], "target": answer} return self.verif_func(pred_example) - if __name__ == "__main__": # import openai # import os From 7ff01defb90b69cbe5838bdd1d6d0a949198352f Mon Sep 17 00:00:00 2001 From: Srinagesh Sharma <srsharm@microsoft.com> Date: Thu, 15 Jun 2023 12:00:01 -0400 Subject: [PATCH 09/10] Adding programmatic baseline with modified eval --- flaml/autogen/datagen.py | 37 ++++++++++++++++------------- test/autogen/test_adv_gen.py | 46 ++++++++++++++++++++---------------- 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/flaml/autogen/datagen.py b/flaml/autogen/datagen.py index 145c91db1a..5e0e7d9e1e 100644 --- a/flaml/autogen/datagen.py +++ b/flaml/autogen/datagen.py @@ -7,18 +7,18 @@ logger = logging.getLogger(__name__) -def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, **config): +def generate_adversarial_examples(data, test_func, eval_func, num_examples=5, **config): base_prompt = """ - # Instructions - - Generate adversarial versions of the examples in the following task. - - Make sure that the input would result in the same target as specified. - - Make sure that the inputs are of the same types that are specified in the examples. - - Generate parsable json with double quotes. - - Do not replace integers with words. - <|start|>(example) - {example} - <|end|> - <|start|>(answer) +# Instructions +- Generate a complex version of the example in the following task. +- Make sure that the inputs are of the same types that are specified in the examples. +- Generate a json with double quotes. +- Do not replace integers with words. 
+- For mathematical examples use programmatic syntax. For example, use '*' instead of 'x' for multiplication +<|start|>(example) +{example} +<|end|> +<|start|>(answer) """ # base_settings = { @@ -33,17 +33,20 @@ def generate_adversarial_examples(data, verif_func, eval_func, num_examples=5, * adv_examples = [] def group_check(candidate): # replace with loss function - verif = verif_func(candidate) - cand_test = eval_func(candidate) - return verif and not cand_test + eval_cands = eval_func(candidate) + test_cands = test_func(candidate, eval_cands) + return not test_cands + ii = 0 while len(adv_examples) < num_examples and iteration < max_iter: # query = base_settings # query["prompt"] = base_prompt.format(examples=str(data)) - # time.sleep(62) - response = oai.Completion.create({"example": str(data)}, prompt=base_prompt, **config) + print(f"iteration={iteration}") + sample = data[ii % len(data)] + response = oai.Completion.create({"example": sample}, prompt=base_prompt, **config) resp_candidates = re.findall(r"(?={).*(?<=})", oai.Completion.extract_text(response)[0]) adv_candidates = list(map(eval, resp_candidates)) + time.sleep(17) eval_candidates = list(map(group_check, adv_candidates)) valid_candidates = list(compress(adv_candidates, eval_candidates)) if len(valid_candidates) > 0: @@ -51,6 +54,8 @@ def group_check(candidate): # replace with loss function iteration = 0 else: iteration += 1 + time.sleep(17) + ii += 1 return adv_examples diff --git a/test/autogen/test_adv_gen.py b/test/autogen/test_adv_gen.py index dc1ab5b2bb..53522710c0 100644 --- a/test/autogen/test_adv_gen.py +++ b/test/autogen/test_adv_gen.py @@ -5,11 +5,11 @@ import hydra import wikipedia -KEY_LOC = "test/autogen" +KEY_LOC = "./test/autogen" logger = logging.getLogger(__name__) -@hydra.main(config_path="configs", config_name="config") +@hydra.main(config_path="configs", config_name="config-srsharm") def test_adv_gen(cfg): try: import openai @@ -17,25 +17,25 @@ def test_adv_gen(cfg): return # config_list_adv = oai.config_list_gpt4_gpt35(KEY_LOC) - config_list_adv = oai.config_list_openai_aoai(KEY_LOC)[1:] - # config_list_adv[0].update(cfg.openai.adv) + config_list_adv = oai.config_list_openai_aoai(KEY_LOC) # [1:] + config_list_adv[0].update(cfg.openai.adv) config_list_eval = oai.config_list_openai_aoai(KEY_LOC) - # config_list_eval[0].update(cfg.openai.eval) - - test_cases = [ # SimpleArith(config_list=config_list_eval) - WikipediaQGen( - config_list_adv=config_list_adv, - config_adv=cfg.openai.adv, - config_list_eval=config_list_eval, - config_eval=cfg.openai.eval, - ) + config_list_eval[0].update(cfg.openai.eval) + + test_cases = [ SimpleArith(config_list=config_list_eval) + # WikipediaQGen( + # config_list_adv=config_list_adv, + # config_adv=cfg.openai.adv, + # config_list_eval=config_list_eval, + # config_eval=cfg.openai.eval, + # ) ] for case in test_cases: adv_examples = generate_adversarial_examples( data=case.input_examples, - verif_func=case.verif_func, - eval_func=case.test_func, + test_func=case.test_func, + eval_func=case.eval_func, num_examples=5, # reduction=np.mean, config_list=config_list_adv, @@ -60,26 +60,32 @@ def __init__(self, config_list): self.config_list = config_list @staticmethod - def verif_func(example): + def test_func(example, eval_out): lhs = eval(re.findall(r"^(.*?)=", example["input"])[0].strip()) - rhs = int(example["target"]) + try: + rhs = int(eval_out) + except: + rhs = 0 + + logger.info(f"example={example}, llm_response={eval_out}") return lhs == rhs - def test_func(self, example): + 
def eval_func(self, example): base_prompt = "{input}" config = { - "max_tokens": 64, + "max_tokens": 5, "temperature": 0, "top_p": 1, "n": 1, "stream": False, "model": "text-davinci-003", + "stop": "\n" } # query['prompt'] = base_prompt.format(example['input']) # resp = oai.Completion.create(**query) response = oai.Completion.create(example, prompt=base_prompt, config_list=self.config_list, **config) - return example["target"] == oai.Completion.extract_text(response)[0].strip() + return oai.Completion.extract_text(response)[0].strip() class WikipediaQGen: From 9f7e914e6456a6d2a6309f291bafb7f6bce23587 Mon Sep 17 00:00:00 2001 From: Srinagesh Sharma <srsharm@microsoft.com> Date: Thu, 15 Jun 2023 13:53:38 -0400 Subject: [PATCH 10/10] Fixes to manage inconsistent llm outputs --- flaml/autogen/datagen.py | 24 +++++++++++++----------- test/autogen/test_adv_gen.py | 15 ++++++++------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/flaml/autogen/datagen.py b/flaml/autogen/datagen.py index 5e0e7d9e1e..d83bf5c68c 100644 --- a/flaml/autogen/datagen.py +++ b/flaml/autogen/datagen.py @@ -12,6 +12,7 @@ def generate_adversarial_examples(data, test_func, eval_func, num_examples=5, ** # Instructions - Generate a complex version of the example in the following task. - Make sure that the inputs are of the same types that are specified in the examples. +- Maintain the same format as the input examples, but feel free to be creative within that. - Generate a json with double quotes. - Do not replace integers with words. - For mathematical examples use programmatic syntax. For example, use '*' instead of 'x' for multiplication @@ -35,7 +36,7 @@ def generate_adversarial_examples(data, test_func, eval_func, num_examples=5, ** def group_check(candidate): # replace with loss function eval_cands = eval_func(candidate) test_cands = test_func(candidate, eval_cands) - return not test_cands + return (test_cands == 0) ii = 0 while len(adv_examples) < num_examples and iteration < max_iter: @@ -45,16 +46,17 @@ def group_check(candidate): # replace with loss function sample = data[ii % len(data)] response = oai.Completion.create({"example": sample}, prompt=base_prompt, **config) resp_candidates = re.findall(r"(?={).*(?<=})", oai.Completion.extract_text(response)[0]) - adv_candidates = list(map(eval, resp_candidates)) - time.sleep(17) - eval_candidates = list(map(group_check, adv_candidates)) - valid_candidates = list(compress(adv_candidates, eval_candidates)) - if len(valid_candidates) > 0: - adv_examples.append(valid_candidates) - iteration = 0 - else: - iteration += 1 - time.sleep(17) + if len(resp_candidates) > 0: + adv_candidates = list(map(eval, resp_candidates)) + time.sleep(30) + eval_candidates = list(map(group_check, adv_candidates)) + valid_candidates = list(compress(adv_candidates, eval_candidates)) + if len(valid_candidates) > 0: + adv_examples.append(valid_candidates) + iteration = 0 + else: + iteration += 1 + time.sleep(30) ii += 1 return adv_examples diff --git a/test/autogen/test_adv_gen.py b/test/autogen/test_adv_gen.py index 53522710c0..be9edd39ca 100644 --- a/test/autogen/test_adv_gen.py +++ b/test/autogen/test_adv_gen.py @@ -61,15 +61,17 @@ def __init__(self, config_list): @staticmethod def test_func(example, eval_out): - lhs = eval(re.findall(r"^(.*?)=", example["input"])[0].strip()) + logger.info(f"example input = {example['input']}") try: - rhs = int(eval_out) + lhs = eval(re.findall(r"^(.*?)=", example["input"])[0].strip()) + logger.info(f"example={example}, llm_response={eval_out}") 
+ rhs = float(eval_out) + return lhs == rhs except: - rhs = 0 + logger.info('eval was unsuccessful due to errors') + return -1 - logger.info(f"example={example}, llm_response={eval_out}") - return lhs == rhs def eval_func(self, example): base_prompt = "{input}" @@ -79,8 +81,7 @@ def eval_func(self, example): "top_p": 1, "n": 1, "stream": False, - "model": "text-davinci-003", - "stop": "\n" + "model": "text-davinci-003" } # query['prompt'] = base_prompt.format(example['input']) # resp = oai.Completion.create(**query)
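A minimal usage sketch of the `generate_adversarial_examples` API as it stands at the end of this patch series (signature `data, test_func, eval_func, num_examples, **config`). The key location, the model names, and the sample data below are illustrative assumptions, not values fixed by these patches:

import re
from flaml import oai
from flaml.autogen.datagen import generate_adversarial_examples

KEY_LOC = "./test/autogen"  # assumed directory holding the OpenAI/AOAI key files
config_list = oai.config_list_openai_aoai(KEY_LOC)

def eval_func(example):
    # Ask the evaluation model to complete the arithmetic prompt.
    config = {"max_tokens": 5, "temperature": 0, "model": "text-davinci-003"}
    response = oai.Completion.create(example, prompt="{input}", config_list=config_list, **config)
    return oai.Completion.extract_text(response)[0].strip()

def test_func(example, eval_out):
    # True when the model's answer equals the value recomputed from the input;
    # -1 signals an unparsable model output (mirrors the SimpleArith case above).
    try:
        lhs = eval(re.findall(r"^(.*?)=", example["input"])[0].strip())
        return lhs == float(eval_out)
    except Exception:
        return -1

data = [{"input": "1 + 4 =", "target": "5"}, {"input": "47 + 8 =", "target": "55"}]
adv_examples = generate_adversarial_examples(
    data=data,
    test_func=test_func,
    eval_func=eval_func,
    num_examples=5,
    config_list=config_list,
    model="gpt-4",  # assumed model for the adversarial generator
)
print(adv_examples)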