From a05cf83fbc7c70a44d416bc88d2f92fb0c4d5443 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Thu, 14 Sep 2023 08:32:38 +0000
Subject: [PATCH] update processing

---
Reviewer notes (this section is ignored by `git am`):
  * Line breaks were reconstructed from a single-line copy of this patch.
    The indentation of the Python context lines is assumed (8 spaces for the
    `del` statements, 4 for the rest) -- confirm against
    final_post_processing.py before applying.
  * process_all.sh: added `set -e` and fixed the "fileds" comment typo. The
    `index` hash of process_all.sh below is carried over from the original
    patch and does not describe the edited content.

 .../lmsys_chat_1m/final_post_processing.py    |  2 +-
 .../lmsys_chat_1m/process_all.sh              | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh

diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py
index 96fd526ae..8d47f6bf0 100644
--- a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py
+++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/final_post_processing.py
@@ -19,7 +19,7 @@
         del c["tstamp"]
         del c["user_id"]
 
-    np.random.seed(42)
+    np.random.seed(44)
     np.random.shuffle(convs)
     convs = convs[:args.number]
 
diff --git a/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh
new file mode 100644
index 000000000..153b2c007
--- /dev/null
+++ b/fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/process_all.sh
@@ -0,0 +1,20 @@
+# Stop at the first failing step so later steps (e.g. the HF upload) never run on missing or stale input.
+set -e
+
+export BASE=clean_conv_20230809_1.5M_pii
+export SCALE=1
+
+# filter words
+python3 filter_bad_conv.py --in $BASE.json
+
+# Clean up some fields (e.g., timestamps)
+python3 final_post_processing.py --in $BASE.s1.json
+
+# upload to hf
+python3 upload_hf_dataset.py --in $BASE.s1.s2.json
+
+# Make another version with openai moderation tag
+python3 merge_oai_tag.py --in $BASE.s1.s2.json
+
+# Make visualizations
+python3 compute_stats.py --in $BASE.s1.json --scale $SCALE