# dataset_loader.py

import random
import datasets
import tqdm
import pandas as pd
import re
import json
import numpy as np
import random
import os

# Dataset names accepted by `load()`, which dispatches each name to the
# module-level function `load_<name>`. To add a dataset, append its name here
# and write the matching `load_<name>` parsing function.
DATASETS = ['TruthfulQA', 'SQuAD1', 'NarrativeQA', 'Mixcase', 'All']


def process_spaces(text):
    """
    Clean up tokenizer-style spacing, quote markers and newline placeholders.

    The substitutions are applied strictly in the order listed below (each rule
    sees the output of the previous ones), and the final string is stripped of
    leading/trailing whitespace.

    Args:
        text (str): The raw text.

    Returns:
        str: The cleaned text.
    """
    substitutions = (
        (' ,', ','),
        (' .', '.'),
        (' ?', '?'),
        (' !', '!'),
        (' ;', ';'),
        (' \'', '\''),
        (' ’ ', '\''),
        (' :', ':'),
        ('<newline>', '\n'),
        ('`` ', '"'),
        (' \'\'', '"'),
        ('\'\'', '"'),
        ('.. ', '... '),
        (' )', ')'),
        ('( ', '('),
        (' n\'t', 'n\'t'),
        (' i ', ' I '),
        (' i\'', ' I\''),
        ('\\\'', '\''),
        ('\n ', '\n'),
    )
    cleaned = text
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)
    return cleaned.strip()


def process_text_truthfulqa_adv(text):
    """
    Remove leading apology / "AI language model" disclaimer sentences.

    If the text contains "I am sorry", everything up to and including the
    first period (plus the one character after it) is dropped. The same cut is
    then applied if the remaining text contains "as an AI language model" or
    "As an AI language model". A text without any period is left unchanged
    instead of raising ``ValueError``.

    Args:
        text (str): The generated answer.

    Returns:
        str: The answer with the matched leading sentence(s) removed.
    """
    if "I am sorry" in text:
        text = _drop_through_first_period(text)
    if "as an AI language model" in text or "As an AI language model" in text:
        text = _drop_through_first_period(text)
    return text


def _drop_through_first_period(text):
    """Return *text* after its first period and the character that follows it."""
    first_period = text.find('.')
    if first_period == -1:
        # Nothing to cut at: keep the text rather than failing.
        return text
    # +2 skips the period itself and the next character (presumably a space).
    return text[first_period + 2:]

def process_dataset(dataset_name, MGT_only_GPT, data_new):
    """
    Load one QA dataset and append its samples to the training split in place.

    Args:
        dataset_name (str): The name of the dataset, forwarded to load_dataset.
        MGT_only_GPT (bool): Flag indicating whether to load only MGT generated by GPT-family.
        data_new (dict): The accumulated data; its 'train' text/label lists are extended.

    Returns:
        tuple: (HWT sample count, MGT sample count) as reported by load_dataset.
    """
    loaded, n_hwt, n_mgt = load_dataset(dataset_name, MGT_only_GPT)
    train_split = data_new['train']
    train_split['text'] += loaded['text']
    train_split['label'] += loaded['label']
    return n_hwt, n_mgt

def load_All(filename1, filename2: str = None, MGT_only_GPT: bool = False, train_threshold: int = 10000, mixcase_threshold: float = 0.8, test_only: bool = False, no_auc: bool = False, train_with_mixcase: bool = False, seed: int = 0, three_classes: bool = False, mixcase_as_mgt:bool=False):
    """
    Load and process datasets for training and testing.

    The training split is built from the SQuAD1, TruthfulQA and NarrativeQA
    samples, topped up with human-written text so that HWT and MGT counts
    match, shuffled and trimmed to ``train_threshold``. Mixcase data is then
    added to the test split (and, depending on the flags, the training split).

    Args:
        filename1 (str): The filename of the first dataset.
        filename2 (str, optional): The filename of the second dataset. Defaults to None.
        MGT_only_GPT (bool, optional): Flag indicating whether to use MGT generated by GPT-family. Defaults to False.
        train_threshold (int, optional): The threshold for the number of training samples. Defaults to 10000.
        mixcase_threshold (float, optional): The threshold for mixcase data. Defaults to 0.8.
        test_only (bool, optional): Flag indicating whether to return test data only. The training
            data is still loaded and then discarded. Defaults to False.
        no_auc (bool, optional): Flag indicating whether to exclude AUC calculation. Defaults to False.
        train_with_mixcase (bool, optional): Flag indicating whether to train with mixcase data. Defaults to False.
        seed (int, optional): The random seed. Defaults to 0.
        three_classes (bool, optional): Flag indicating whether to use three classes. Defaults to False.
        mixcase_as_mgt (bool, optional): Flag indicating whether to treat mixcase data as MGT. Defaults to False.

    Returns:
        dict: ``{'train': {'text': list, 'label': list}, 'test': {'text': list, 'label': list}}``.
    """
    data_new = {
        'train': {
            'text': [],
            'label': [],
        },
        'test': {
            'text': [],
            'label': [],
        }
    }
    hwt_cnt, mgt_cnt = 0, 0

    # Process each dataset
    for dataset in ['SQuAD1', 'TruthfulQA', 'NarrativeQA']:
        data_hwt_cnt, data_mgt_cnt = process_dataset(dataset, MGT_only_GPT, data_new)
        hwt_cnt += data_hwt_cnt
        mgt_cnt += data_mgt_cnt

    # Top up with human-written text to balance the classes. The request is
    # clamped at zero: a negative size (more HWT than MGT already loaded)
    # would otherwise be used as an array length.
    HWT_data = load_HWT(max(0, mgt_cnt - hwt_cnt))
    data_new['train']['text'] += HWT_data['text']
    data_new['train']['label'] += HWT_data['label'].tolist()

    # Shuffle and trim data
    new_text, new_label = shuffle_and_trim_data(data_new, seed, train_threshold)
    data_new['train']['text'] = new_text
    data_new['train']['label'] = new_label

    # Process Mixcase data
    if MGT_only_GPT:
        # NOTE(review): this branch forces train_with_mixcase=True, seed=0 and
        # three_classes=False instead of forwarding the caller's values —
        # confirm this is intentional for the GPT-only experiment.
        mixcase_data = process_mixcase_data(filename1, filename2, no_auc, mixcase_threshold, train_with_mixcase = True, seed=0, three_classes=False, data_new=data_new, test_only=test_only, mgt_only_gpt=MGT_only_GPT, mixcase_as_mgt=mixcase_as_mgt)
        data_new['test']['text'] += mixcase_data['test']['text']
        data_new['test']['label'] += mixcase_data['test']['label']
        if train_with_mixcase:
            data_new['train']['text'] += mixcase_data['train']['text']
            data_new['train']['label'] += mixcase_data['train']['label']
    else:
        mixcase_data = process_mixcase_data(filename1, filename2, no_auc, mixcase_threshold, train_with_mixcase, seed, three_classes, data_new, test_only, mixcase_as_mgt=mixcase_as_mgt)
        data_new['train']['text'] += mixcase_data['train']['text']
        data_new['test']['text'] += mixcase_data['test']['text']
        data_new['train']['label'] += mixcase_data['train']['label']
        data_new['test']['label'] += mixcase_data['test']['label']

    if test_only:
        data_new['train']['text'] = []
        data_new['train']['label'] = []
    else:
        combined = list(zip(data_new['train']['text'], data_new['train']['label']))
        random.seed(seed)
        random.shuffle(combined)
        # Rebuild plain lists: unpacking zip(*combined) fails on an empty
        # training set and would leave tuples where the test split has lists.
        data_new['train']['text'] = [text for text, _ in combined]
        data_new['train']['label'] = [label for _, label in combined]

    return data_new

def shuffle_and_trim_data(data_new, seed, train_threshold):
    """
    Return a seeded shuffle of the training split, cut to a maximum size.

    The input dictionary is not modified; text and label stay aligned.

    Args:
        data_new (dict): Data dictionary with data_new['train']['text'] and data_new['train']['label'].
        seed (int): Seed for the shuffle. None is treated as 0.
        train_threshold (int): Slice bound applied to the shuffled lists.

    Returns:
        tuple: (shuffled and trimmed text list, shuffled and trimmed label list).
    """
    texts = data_new['train']['text']
    labels = data_new['train']['label']

    order = list(range(len(labels)))
    random.seed(0 if seed is None else seed)
    random.shuffle(order)

    shuffled_text = [texts[position] for position in order]
    shuffled_label = [labels[position] for position in order]
    return shuffled_text[:train_threshold], shuffled_label[:train_threshold]

def process_mixcase_data(filename1, filename2, no_auc, mixcase_threshold, train_with_mixcase, seed, three_classes, data_new, test_only, mgt_only_gpt:bool=False, mixcase_as_mgt:bool=False):
    """
    Build the Mixcase train/test splits for the selected experiment setting.

    Three settings are handled:
      * filename2 given: train on filename1, test on filename2 (transfer setting).
      * filename2 is None and train_with_mixcase: train on the files found by
        load_and_process_multiple_mixcase_files, test on filename1's test part.
      * filename2 is None and not train_with_mixcase: no training data, the
        whole of filename1 becomes the test split.

    Args:
        filename1 (str): The filename for the first dataset.
        filename2 (str): The filename for the second dataset, or None.
        no_auc (bool): Flag forwarded to load_Mixcase.
        mixcase_threshold (float): The threshold for Mixcase data.
        train_with_mixcase (bool): Flag indicating whether to train with Mixcase data.
        seed (int): The seed value for randomization.
        three_classes (bool): Flag indicating whether to use three classes.
        data_new (dict): Not read. A fresh dictionary is created and returned,
            so the caller's dictionary is left untouched.
        test_only (bool): Flag indicating whether to empty the training split before returning.
        mgt_only_gpt (bool, optional): Flag indicating whether to use MGT generated by GPT-family. Defaults to False.
        mixcase_as_mgt (bool, optional): Flag indicating whether to treat Mixcase as MGT. Defaults to False.

    Returns:
        dict: ``{'train': {'text', 'label'}, 'test': {'text', 'label'}}`` holding only Mixcase data.
    """
    result = {
        'train': {'text': [], 'label': []},
        'test': {'text': [], 'label': []},
    }
    train_split = result['train']
    test_split = result['test']

    if filename2 is not None:
        # Ex3, Transfer learning setting, train on operation of filename1 and evaluate on filename2.
        train_source = load_Mixcase(filename1, no_auc, mixcase_threshold, train_with_mixcase, seed, three_classes, mixcase_as_mgt=mixcase_as_mgt)
        for part in ('mixcase', 'other'):
            train_split['text'].extend(train_source['train'][part]['text'])
            train_split['label'].extend(train_source['train'][part]['label'])
        test_source = load_Mixcase(filename2, no_auc, mixcase_threshold, train_with_mixcase, seed, three_classes, mixcase_as_mgt=mixcase_as_mgt)
        test_split['text'].extend(test_source['test']['text'])
        test_split['label'].extend(test_source['test']['label'])
    elif train_with_mixcase:
        # Training data comes from every Mixcase file; the helper fills and
        # reshuffles result['train'] in place.
        load_and_process_multiple_mixcase_files(result, no_auc, mixcase_threshold, train_with_mixcase, seed, three_classes, mgt_only_gpt=mgt_only_gpt, mixcase_as_mgt=mixcase_as_mgt)
        test_source = load_Mixcase(filename1, no_auc, mixcase_threshold, train_with_mixcase, seed, three_classes, mixcase_as_mgt=mixcase_as_mgt)
        test_split['text'].extend(test_source['test']['text'])
        test_split['label'].extend(test_source['test']['label'])
    else:
        # Ex1, train with pure MGT and HWT, then test on Mixcase in binary classification setting.
        flat_source = load_Mixcase(filename=filename1, no_auc=no_auc, train_with_mixcase=train_with_mixcase, mixcase_as_mgt=mixcase_as_mgt)
        test_split['text'].extend(flat_source['text'])
        test_split['label'].extend(flat_source['label'])

    if test_only:
        # set training data to none for quicker inference
        train_split['text'] = []
        train_split['label'] = []

    return result

def load_and_process_multiple_mixcase_files(data_new, no_auc, mixcase_threshold, train_with_mixcase, seed, three_classes, mgt_only_gpt, mixcase_as_mgt, mixcase_dir="/media/ssd/cdp/Mixcase/Mixcase/data/mixcase_data"):
    """
    Load every Mixcase JSON file under a directory into the training split.

    The directory tree is walked for ``.json`` files. Each file is loaded with
    load_Mixcase and its training samples are appended to
    ``data_new['train']``; the training split is then reshuffled in place.

    Args:
        data_new (dict): The dictionary containing the data; modified in place.
        no_auc (bool): Flag forwarded to load_Mixcase.
        mixcase_threshold (float): The threshold for including Mixcase data. A value of 0 loads nothing.
        train_with_mixcase (bool): Flag indicating whether to train with Mixcase data.
        seed (int): The random seed.
        three_classes (bool): Flag indicating whether to use three classes.
        mgt_only_gpt (bool): When True, only files whose name contains "GPT" are used.
        mixcase_as_mgt (bool): Flag indicating whether to treat Mixcase data as MGT data.
        mixcase_dir (str, optional): Directory searched for Mixcase files.
            Defaults to the path that was previously hard-coded.

    Returns:
        None
    """
    file_paths = []
    for root, _dirs, files in os.walk(mixcase_dir):
        for file_name in files:
            if not file_name.endswith('.json'):
                continue
            # Previously the GPT check fell through to an unconditional
            # append, so GPT files were added twice and other files were
            # never excluded.
            if mgt_only_gpt and "GPT" not in file_name:
                continue
            file_paths.append(os.path.join(root, file_name))

    if mixcase_threshold != 0:
        for file_path in file_paths:
            Mixcase_data = load_Mixcase(file_path, no_auc, mixcase_threshold, train_with_mixcase, seed, three_classes=three_classes, mixcase_as_mgt=mixcase_as_mgt)
            mixcase_part = Mixcase_data['train']['mixcase']
            other_part = Mixcase_data['train']['other']

            data_new['train']['text'] += mixcase_part['text']
            data_new['train']['label'] += mixcase_part['label']

            # The first "other" sample is used as a marker: if it is already
            # in the training text, this file's "other" samples are skipped.
            # An empty "other" part has nothing to add, so skip the lookup.
            other_texts = other_part['text']
            already_present = bool(other_texts) and other_texts[0] in data_new['train']['text']
            if not already_present:
                data_new['train']['text'] += other_texts
                data_new['train']['label'] += other_part['label']
            print(f"The size of MixText in training set: {len(data_new['train']['label'])}")

    # Shuffle the combined training data (no trimming: the limit is its own length)
    new_text, new_label = shuffle_and_trim_data(data_new, seed, len(data_new['train']['label']))
    data_new['train']['text'] = new_text
    data_new['train']['label'] = new_label

def load_HWT(HWT_size, seed:int=0):
    """
    Load human-written text (HWT) samples from the processed HWT files.

    Sentences are read from JSON-lines files, cleaned with process_spaces and
    kept when they have more than 30 and fewer than 120 words and fewer than
    2000 characters. A seeded shuffle picks the samples, and each one is cut
    to a random length of 50 to 120 words.

    Parameters:
    - HWT_size (int): The desired number of samples. Zero or a negative value
      returns an empty result without reading any file.
    - seed (int): The seed value for randomization (default is 0).

    Returns:
    - data_new (dict): A dictionary containing the loaded data with the following keys:
        - 'text': A list of processed sentences.
        - 'label': A numpy array of zeros with a length equal to the minimum of HWT_size and the total number of loaded sentences.
    """
    if HWT_size <= 0:
        # A negative size would be rejected by np.zeros; nothing was requested.
        return {'text': [], 'label': np.zeros(0, dtype=int)}

    root = './data/pure_processed_HWT/'
    file_names = ['blog.json', 'game.json', 'News.json', 'paper_abstract.json', 'talks.json', 'transcript.json', 'email.json']
    data_sum = []
    for name in file_names:
        with open(root+name, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines are not valid JSON documents.
                    continue
                # Parse each line as JSON
                json_object = json.loads(line)
                processed_sentence = process_spaces(json_object['sentence'])
                word_count = len(processed_sentence.split())
                if 30 < word_count < 120 and len(processed_sentence) < 2000:
                    data_sum.append(processed_sentence)
    index_list = list(range(len(data_sum)))
    random.seed(seed)
    random.shuffle(index_list)

    sample_count = min(HWT_size, len(data_sum))
    data_new = {'text': [],
                'label': np.zeros(sample_count, dtype=int)}
    for i in range(sample_count):
        index = random.randint(50, 120)
        sentence = data_sum[index_list[i]]
        # process_spaces is applied a second time on purpose: it is not
        # idempotent, and dropping the call would change the produced text.
        words = process_spaces(sentence).split()
        data_new['text'].append(' '.join(words[:min(index, len(sentence.split()))]))
    return data_new

def load_Mixcase(filename, no_auc:bool=False, mixcase_threshold: float=0.8, train_with_mixcase:bool = False, seed:int=None, three_classes:bool=False, mixcase_as_mgt:bool=False, train_size:int=250):
    """
    Load Mixcase dataset from a JSON file.

    The file is a JSON list of records. The record key containing "output"
    (the last one if several match) holds the mixcase text. Depending on the
    record layout and ``mixcase_as_mgt`` the mixcase text is paired with:
      * ``HWT_sentence`` (label 0) when the records have that key;
      * ``MGT_sentence`` (label 1) otherwise, with mixcase labelled 0;
      * ``HWT_sentence`` from ``2llama_polish_token.json`` when
        ``mixcase_as_mgt`` is set and the records have no ``HWT_sentence``.

    Args:
        filename (str): The name of the JSON file, relative to ./data/MixSet (an absolute path is used as is).
        no_auc (bool): When True and ``train_with_mixcase`` is False, only the first group
            of samples is returned (the paired group is left out). Not used when ``train_with_mixcase`` is True.
        mixcase_threshold (float, optional): The fraction of the training pool that is used. Defaults to 0.8.
        train_with_mixcase (bool, optional): Flag indicating whether to produce a train/test split. Defaults to False.
        seed (int, optional): The seed value for random shuffling. With None the shuffle is
            not reproducible. Defaults to None.
        three_classes (bool, optional): Flag indicating whether mixcase samples get label 2. Defaults to False.
        mixcase_as_mgt (bool, optional): Flag indicating whether to treat mixcase samples as MGT samples. Defaults to False.
        train_size (int, optional): Number of leading records reserved for training; the rest
            form the test split. Defaults to 250.

    Returns:
        dict: ``{'text': [...], 'label': [...]}`` when ``train_with_mixcase`` is False, otherwise
        ``{'train': {'mixcase': {...}, 'other': {...}}, 'test': {'text': [...], 'label': [...]}}``.

    Raises:
        ValueError: If the file holds no records or no record key contains "output".
    """
    with open(os.path.join("./data/MixSet",filename), encoding="utf-8") as file:
        records = json.load(file)
    if not records:
        raise ValueError(f'No records found in {filename}')

    output_keys = [key for key in records[0].keys() if "output" in str(key)]
    if not output_keys:
        raise ValueError(f'No key containing "output" found in {filename}')
    real_key = output_keys[-1]

    if "HWT_sentence" in records[0].keys():
        test_MGT = True
        mgt = [record[real_key] for record in records]
        hwt = [record['HWT_sentence'] for record in records]
    elif not mixcase_as_mgt:
        test_MGT = False
        mgt = [record['MGT_sentence'] for record in records]
        hwt = [record[real_key] for record in records]
    else:
        test_MGT = True
        mgt = [record[real_key] for record in records]
        with open(os.path.join("../Mixcase/data/mixcase_data","2llama_polish_token.json"), encoding="utf-8") as file:
            reference_records = json.load(file)
        hwt = [record['HWT_sentence'] for record in reference_records]

    print(f"Test_MGT:{test_MGT}")

    if not train_with_mixcase:
        # The group under test comes first; its counterpart follows unless
        # no_auc is set.
        groups = [(mgt, 1), (hwt, 0)] if test_MGT else [(hwt, 0), (mgt, 1)]
        if no_auc:
            groups = groups[:1]
        samples = [(text, label) for group, label in groups for text in group]
        data_new = {
                'text': [],
                'label': []
        }
        for text, label in tqdm.tqdm(samples, desc="parsing data"):
            data_new['text'].append(process_spaces(text))
            data_new['label'].append(label)
        return data_new

    data_new = {
        'train': {
            'mixcase':{
                'text': [],
                'label': [],
            },
            'other':{
                'text': [],
                'label': [],
            }
        },
        'test': {
            'text': [],
            'label': [],
        }
    }
    train_mgt, test_mgt = mgt[:train_size], mgt[train_size:]
    train_hwt, test_hwt = hwt[:train_size], hwt[train_size:]

    # Index only what both training pools actually contain, so a file with
    # fewer than train_size records does not fail with an IndexError.
    index_list = list(range(min(len(train_mgt), len(train_hwt))))
    random.seed(seed)
    random.shuffle(index_list)
    # Clamp so a threshold above 1 (or below 0) cannot run past the pool.
    train_count = max(0, min(len(index_list), int(len(index_list) * mixcase_threshold)))
    selected = index_list[:train_count]

    if test_MGT:
        train_mixcase, test_mixcase = train_mgt, test_mgt
        train_pure, test_pure = train_hwt, test_hwt
        pure_label = 0
    else:
        train_mixcase, test_mixcase = train_hwt, test_hwt
        train_pure, test_pure = train_mgt, test_mgt
        pure_label = 1
    # Binary setting: mixcase takes the label opposite to its paired text.
    mixcase_label = 2 if three_classes else 1 - pure_label

    for idx in tqdm.tqdm(selected, desc="parsing data"):
        data_new['train']['mixcase']['text'].append(process_spaces(train_mixcase[idx]))
        data_new['train']['mixcase']['label'].append(mixcase_label)
        data_new['train']['other']['text'].append(process_spaces(train_pure[idx]))
        data_new['train']['other']['label'].append(pure_label)

    # zip stops at the shorter list, so unequal test pools cannot raise IndexError.
    if three_classes:
        for mixed_text, pure_text in zip(test_mixcase, test_pure):
            data_new['test']['text'].append(process_spaces(mixed_text))
            data_new['test']['label'].append(2)
            data_new['test']['text'].append(process_spaces(pure_text))
            data_new['test']['label'].append(pure_label)
    else:
        for machine_text, human_text in zip(test_mgt, test_hwt):
            data_new['test']['text'].append(process_spaces(machine_text))
            data_new['test']['label'].append(1)
            data_new['test']['text'].append(process_spaces(human_text))
            data_new['test']['label'].append(0)
    return data_new

def load_dataset(dataset_name, MGT_only_GPT):
    """
    Load human and LLM answers of one QA dataset as a shuffled, labelled list.

    Answers are cleaned with process_spaces and kept only when they have more
    than 50 words and fewer than 2000 characters. Human answers get label 0,
    LLM answers label 1. The order is a shuffle with a fixed seed of 0.

    Args:
        dataset_name (str): One of 'TruthfulQA', 'SQuAD1' or 'NarrativeQA'.
        MGT_only_GPT (bool): When True only the ChatGPT, ChatGPT-turbo and GPT4
            answer columns are used; otherwise all seven LLM columns.

    Returns:
        tuple: ``(data, hwt_cnt, mgt_cnt)`` where ``data`` is
        ``{'text': [...], 'label': [...]}`` and the counts are the number of
        human and machine samples it contains.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported datasets.
    """
    if dataset_name == 'TruthfulQA':
        f = pd.read_csv("./data/MGT_datasets/modified_TruthfulQA_LLMs.csv")
        a_human = f['Best Answer'].tolist()
    elif dataset_name == 'SQuAD1':
        f = pd.read_csv("./data/MGT_datasets/modified_SQuAD1_LLMs.csv")
        # NOTE(security): eval executes each CSV cell as Python code; this is
        # only safe for trusted files. Consider ast.literal_eval if the cells
        # are plain literals.
        a_human = [eval(_)['text'][0] for _ in f['answers'].tolist()]
    elif dataset_name == 'NarrativeQA':
        f = pd.read_csv("./data/MGT_datasets/modified_NarrativeQA_LLMs.csv")
        # Several answers are ';'-separated; only the first one is used.
        a_human = [_.split(";")[0] for _ in f['answers'].tolist()]
    else:
        # Previously an unknown name fell through and failed later with an
        # unbound local variable.
        raise ValueError(f'Unknown dataset {dataset_name}')

    def _usable(answers):
        """Clean the answers and keep those within the length limits."""
        kept = []
        for answer in answers:
            processed = process_spaces(answer)
            if len(processed.split()) > 50 and len(processed) < 2000:
                kept.append(processed)
        return kept

    hwt = _usable(a_human)

    if MGT_only_GPT:
        # Ex3 LLM-wise transfer, only train with MGT generated by GPT-family
        LLM_set = ['ChatGPT', 'ChatGPT-turbo', 'GPT4']
    else:
        LLM_set = ['ChatGPT', 'BloomZ', 'ChatGLM', 'Dolly', 'ChatGPT-turbo', 'GPT4', 'StableLM']

    mgt = []
    for detectLLM in LLM_set:
        mgt.extend(_usable(f[f'{detectLLM}_answer'].fillna("").tolist()))

    data_new = {
        'text': [],
        'label': [],
    }

    # Indices below len(mgt) address machine text, the rest human text.
    index_list = list(range(len(mgt) + len(hwt)))
    random.seed(0)
    random.shuffle(index_list)

    for position in tqdm.tqdm(index_list, desc="parsing data"):
        if position < len(mgt):
            data_new['text'].append(mgt[position])
            data_new['label'].append(1)
        else:
            data_new['text'].append(hwt[position - len(mgt)])
            data_new['label'].append(0)
    return data_new, len(hwt), len(mgt)

def load_TruthfulQA(detectLLM):
    """
    Load TruthfulQA human/LLM answer pairs split 80/20 into train and test.

    A row is kept when both answers have more than one word and the LLM answer
    has fewer than 2000 characters. Each kept row contributes the human answer
    (label 0) followed by the LLM answer (label 1), both tagged with the row's
    category. Rows are shuffled with a fixed seed of 0.

    Args:
        detectLLM (str): Name of the LLM; the column ``<detectLLM>_answer`` is read.

    Returns:
        dict: ``{'train': {...}, 'test': {...}}``, each with 'text', 'label' and 'category' lists.
    """
    frame = pd.read_csv("datasets/TruthfulQA_LLMs.csv")
    questions = frame['Question'].tolist()
    human_answers = frame['Best Answer'].tolist()
    llm_answers = frame[f'{detectLLM}_answer'].fillna("").tolist()
    categories = frame['Category'].tolist()

    kept = [
        [question, human, machine, category]
        for question, human, machine, category in zip(questions, human_answers, llm_answers, categories)
        if len(human.split()) > 1 and len(machine.split()) > 1 and len(machine) < 2000
    ]

    data_new = {
        split: {'text': [], 'label': [], 'category': []}
        for split in ('train', 'test')
    }

    order = list(range(len(kept)))
    random.seed(0)
    random.shuffle(order)

    total_num = len(kept)
    for position in tqdm.tqdm(range(total_num), desc="parsing data"):
        # The first 80% of the shuffled rows go to train, the rest to test.
        bucket = data_new['train' if position < total_num * 0.8 else 'test']
        _question, human, machine, category = kept[order[position]]
        bucket['text'].append(process_spaces(human))
        bucket['label'].append(0)
        bucket['text'].append(process_spaces(machine))
        bucket['label'].append(1)
        bucket['category'].extend([category, category])

    return data_new


def load_SQuAD1(detectLLM):
    """
    Load SQuAD1 human/LLM answer pairs split 80/20 into train and test.

    A row is kept when both answers have more than one word. Each kept row
    contributes the human answer (label 0) followed by the LLM answer
    (label 1). Rows are shuffled with a fixed seed of 0.

    Args:
        detectLLM (str): Name of the LLM; the column ``<detectLLM>_answer`` is read.

    Returns:
        dict: ``{'train': {...}, 'test': {...}}``, each with 'text' and 'label' lists.
    """
    frame = pd.read_csv("datasets/SQuAD1_LLMs.csv")
    questions = frame['Question'].tolist()
    # NOTE(review): eval executes each CSV cell as Python code; only safe for
    # trusted files.
    human_answers = [eval(_)['text'][0] for _ in frame['answers'].tolist()]
    llm_answers = frame[f'{detectLLM}_answer'].fillna("").tolist()

    pairs = []
    for question, human, machine in zip(questions, human_answers, llm_answers):
        if len(human.split()) > 1 and len(machine.split()) > 1:
            pairs.append([question, human, machine])

    data_new = {
        'train': {'text': [], 'label': []},
        'test': {'text': [], 'label': []},
    }

    shuffled = list(range(len(pairs)))
    random.seed(0)
    random.shuffle(shuffled)

    total_num = len(pairs)
    for rank in tqdm.tqdm(range(total_num), desc="parsing data"):
        # The first 80% of the shuffled rows go to train, the rest to test.
        data_partition = 'train' if rank < total_num * 0.8 else 'test'
        target = data_new[data_partition]
        row = pairs[shuffled[rank]]
        for answer, label in ((row[1], 0), (row[2], 1)):
            target['text'].append(process_spaces(answer))
            target['label'].append(label)
    return data_new


def load_NarrativeQA(detectLLM):
    """
    Load NarrativeQA human/LLM answer pairs split 80/20 into train and test.

    Only the first ';'-separated human answer is used. A row is kept when both
    answers have more than one and fewer than 150 words. Each kept row
    contributes the human answer (label 0) followed by the LLM answer
    (label 1). Rows are shuffled with a fixed seed of 0.

    Args:
        detectLLM (str): Name of the LLM; the column ``<detectLLM>_answer`` is read.

    Returns:
        dict: ``{'train': {...}, 'test': {...}}``, each with 'text' and 'label' lists.
    """
    frame = pd.read_csv("datasets/NarrativeQA_LLMs.csv")
    questions = frame['Question'].tolist()
    raw_answers = frame['answers'].tolist()
    first_answers = [entry.split(";")[0] for entry in raw_answers]
    llm_answers = frame[f'{detectLLM}_answer'].fillna("").tolist()

    rows = []
    for row_id in range(len(questions)):
        human = first_answers[row_id]
        machine = llm_answers[row_id]
        if len(human.split()) > 1 and len(machine.split()) > 1 and len(human.split()) < 150 and len(machine.split()) < 150:
            rows.append([questions[row_id], human, machine])

    data_new = dict(
        train=dict(text=[], label=[]),
        test=dict(text=[], label=[]),
    )

    permutation = list(range(len(rows)))
    random.seed(0)
    random.shuffle(permutation)

    total_num = len(rows)
    for step in tqdm.tqdm(range(total_num), desc="parsing data"):
        # The first 80% of the shuffled rows go to train, the rest to test.
        if step < total_num * 0.8:
            split = data_new['train']
        else:
            split = data_new['test']
        _question, human, machine = rows[permutation[step]]
        split['text'].append(process_spaces(human))
        split['label'].append(0)
        split['text'].append(process_spaces(machine))
        split['label'].append(1)
    return data_new


def load(name, **kwargs):
    """
    Dispatch to the module-level ``load_<name>`` function of a registered dataset.

    Args:
        name (str): A dataset name listed in DATASETS.
        **kwargs: Passed unchanged to the selected loader.

    Returns:
        Whatever the selected loader returns.

    Raises:
        ValueError: If ``name`` is not listed in DATASETS.
    """
    if name not in DATASETS:
        raise ValueError(f'Unknown dataset {name}')
    loader = globals()[f'load_{name}']
    return loader(**kwargs)