From 791aeb20cb42d994c560c7e811c8ad3f8e0300ce Mon Sep 17 00:00:00 2001 From: haonan-li Date: Mon, 22 Apr 2024 22:38:38 +0400 Subject: [PATCH] add minimal test --- docs/README.md | 1 + docs/development_guide.md | 2 +- script/minimal_test.json | 51 ++++++++++++++++++++++++++++++++++++ script/minimal_test.py | 55 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 script/minimal_test.json create mode 100644 script/minimal_test.py diff --git a/docs/README.md b/docs/README.md index 123b638..f65251d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -17,6 +17,7 @@ We welcome contributions and feedback from the community and recommend a few bes * PRs should be titled descriptively, and be opened with a brief description of the scope and intent of the new contribution. * New features should have appropriate documentation added alongside them. * Aim for code maintainability, and minimize code copying. +* Minimal tests are required before submitting a PR: run `script/minimal_test.py` and make sure that all test cases pass. ### For Feature Requests diff --git a/docs/development_guide.md b/docs/development_guide.md index 4ac0d0c..0255bee 100644 --- a/docs/development_guide.md +++ b/docs/development_guide.md @@ -44,7 +44,7 @@ The prompt file should contains a class which is a subclass of `BasePrompt` from ### Prompt Optimization -To optimize the prompt for a specific LLM, you can modify the prompt in `factcheck/utils/prompt/`. We will release a minimal test suite to evaluate the performance of the prompt in the future. +To optimize the prompt for a specific LLM, you can modify the prompt in `factcheck/utils/prompt/`. After optimization, you can run our minimal test in `script/minimal_test.py`; you are also welcome to add more test cases to the minimal test set in `script/minimal_test.json`.
diff --git a/script/minimal_test.json b/script/minimal_test.json new file mode 100644 index 0000000..3fec4db --- /dev/null +++ b/script/minimal_test.json @@ -0,0 +1,51 @@ +[ + { + "id": 1, + "response": "I am a famous scientist.", + "attributes": { + "factuality": "Nothing to check." + } + }, + { + "id": 2, + "response": "Steve Jobs is the founder of Apple.", + "attributes": { + "factuality": true + } + }, + { + "id": 3, + "response": "Elon Musk bought Twitter in 2023 and renamed it to X.", + "attributes": { + "factuality": false + } + }, + { + "id": 4, + "response": "Blockchain technology is only used for cryptocurrencies.", + "attributes": { + "factuality": false + } + }, + { + "id": 5, + "response": "Facial recognition technology is infallible and cannot be tricked.", + "attributes": { + "factuality": false + } + }, + { + "id": 6, + "response": "Shanghai Jiao Tong University is one of the top public universities in Guangdong, China", + "attributes": { + "factuality": false + } + }, + { + "id": 7, + "response": "William Yarnel Slack (August 1, 1816 - March 21, 1862) was an American lawyer, politician, and military officer who fought for the Confederate States of America during the American Civil War. Born in Kentucky, Slack moved to Missouri as a child and later entered the legal profession. After serving in the Missouri General Assembly from 1842 to 1843, he fought as a captain in the United States Army for fourteen months during the Mexican–American War, beginning in 1846. He saw action at the Battle of Embudo Pass and the Siege of Pueblo de Taos. Returning to a legal career, Slack became influential in his local area. After the outbreak of the American Civil War in April 1861, Slack, who held pro-slavery views, supported the Confederate cause. When the Missouri State Guard (MSG) was formed the next month to oppose the Union Army, he was appointed as a brigadier general in the MSG's 4th Division. 
After participating in the Battle of Carthage in July, he fought in the Battle of Wilson's Creek on August 10. After a surprise Union attack, Slack's deployment of his division gave time for further Confederate States Army and MSG troops to deploy. Suffering a bad hip wound at Wilson's Creek, he was unable to rejoin his command until October. Along with other Missouri State Guard officers, Slack transferred to the Confederate States Army in late 1861 where he commanded a brigade with the rank of colonel. On March 7, 1862, during the Battle of Pea Ridge, Slack suffered another wound that was close to the injury he had received at Wilson's Creek. Infection set in, and he died on March 21. He was posthumously promoted to brigadier general in the Confederate army on April 17; the Confederate States Senate might not have known that he was dead at the time of the promotion. (1786)",
+        "attributes": {
+            "factuality": false
+        }
+    }
+]
diff --git a/script/minimal_test.py b/script/minimal_test.py
new file mode 100644
index 0000000..9c2833d
--- /dev/null
+++ b/script/minimal_test.py
@@ -0,0 +1,64 @@
+"""Run the minimal fact-checking regression cases in minimal_test.json."""
+import json
+import sys
+import time
+from pathlib import Path
+
+from tqdm import tqdm
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+# Resolve the repository root from this file, not the current working directory.
+sys.path.append(str(SCRIPT_DIR.parent))
+from factcheck import FactCheck  # noqa: E402
+
+# ANSI escape codes for colors
+green = "\033[92m"
+red = "\033[91m"
+reset = "\033[0m"
+
+
+def minimal_test():
+    """Run every case in minimal_test.json and return the number of failed cases."""
+    # Initialize the FactCheck class
+    factcheck = FactCheck()
+
+    def atom_test(instance):
+        """Return True if check_response() matches every expected attribute."""
+        res = factcheck.check_response(instance["response"])
+        for k, v in instance["attributes"].items():
+            try:
+                actual = res[k]
+            except (LookupError, TypeError):
+                # Attribute absent from the result (or result not subscriptable): a failure.
+                print(f"{k}: <missing>, {v}")
+                return False
+            print(f"{k}: {actual}, {v}")
+            # Compare explicitly: `assert` is stripped when Python runs with -O.
+            if actual != v:
+                return False
+        return True
+
+    # Explicit encoding: the test responses contain non-ASCII characters.
+    with open(SCRIPT_DIR / "minimal_test.json", encoding="utf-8") as f:
+        test_data = json.load(f)
+
+    success_count = 0
+    fail_count = 0
+    with tqdm(total=len(test_data), position=0) as pbar:
+        for test_piece in test_data:
+            if atom_test(test_piece):
+                success_count += 1
+                pbar.colour = "green"
+            else:
+                fail_count += 1
+                pbar.colour = "red"
+            pbar.set_postfix_str("█", refresh=False)
+            pbar.set_description(f"| Success: {success_count}, Failed: {fail_count}", refresh=True)
+            pbar.update(1)
+            time.sleep(0.1)  # Sleep for 0.1 seconds
+    return fail_count
+
+
+if __name__ == "__main__":
+    # Exit non-zero when any case fails so shells and CI can detect it.
+    sys.exit(1 if minimal_test() else 0)