# sgnr_new.py


# -*- coding: utf-8 -*-
"""systematicity-experiment-data-generation-v2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1R7smQachUTswGMFNgsETtb8Zh3nZGIiZ
"""

import json
# Commented out IPython magic to ensure Python compatibility.
import os
import pickle
import random
from collections import OrderedDict

import pandas as pd
from sklearn.model_selection import train_test_split

from ast import literal_eval

import warnings
warnings.filterwarnings("ignore")

import cProfile
cp = cProfile.Profile()
cp.enable()

import copy


# !pip install snakeviz
# %load_ext snakeviz

# %load_ext line_profiler

# !ls -l
# clear_output()

# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)
# root_dir = "/content/gdrive/My Drive/School/PhD/Colab Notebooks/"
# os.chdir(root_dir)
# clear_output()

# os.chdir("compositionality")

def load_json(file_name):
    """Read and decode the JSON document stored at ``file_name``.

    JSON objects are decoded into ``OrderedDict`` instances.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The decoded JSON value.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # locale-dependent default encoding.
    with open(file_name, encoding="utf-8") as data_file:
        return json.load(data_file, object_pairs_hook=OrderedDict)


def save_json(item, file_path):
    """Write ``item`` to ``file_path`` as JSON indented by four spaces.

    The file is opened in ``'w+'`` mode, so existing content is truncated.

    Args:
        item: JSON-serializable value to store.
        file_path: Destination path.
    """
    with open(file_path, 'w+') as out_file:
        out_file.write(json.dumps(item, indent=4))


def generate(thr=0):

    verb_classes = pd.read_csv('./annotations/EPIC_100_verb_classes.csv', converters={'instances': literal_eval},
                               index_col='id')
    noun_classes = pd.read_csv('./annotations/EPIC_100_noun_classes.csv', converters={'instances': literal_eval},
                               index_col='id')

    train_labels = pd.read_csv('./annotations/EPIC_100_train.csv', index_col='narration_id')
    labels = pd.concat([train_labels], sort=False)

    len(verb_classes), len(noun_classes)


    def tokenize_multiword(noun, verb, narr):
        """Fuse a multi-word verb and noun inside a narration into single tokens.

        A ``verb`` containing "-" (e.g. "pick-up") is looked up in ``narr``
        with spaces instead of hyphens and replaced by the hyphenated form.
        A ``noun`` containing ":" is split on ":", its parts are reversed,
        and the space-joined phrase is replaced by the same parts joined
        with "_" (e.g. "board:chopping" turns "chopping board" into
        "chopping_board").

        Any number of "-" / ":" separated parts is supported.

        Args:
            noun: Noun label, parts separated by ":".
            verb: Verb label, parts separated by "-".
            narr: Narration text to rewrite.

        Returns:
            The narration with the matching phrases fused; unchanged when
            neither phrase occurs in it.
        """
        if "-" in verb:
            # Works for verbs with several hyphens as well; unpacking the
            # split into exactly two names raised ValueError for those.
            narr = narr.replace(verb.replace("-", " "), verb)

        if ":" in noun:
            # The label stores its parts in the reverse of the order in which
            # they are searched for in the narration.
            spoken_parts = noun.split(":")[::-1]
            narr = narr.replace(" ".join(spoken_parts), "_".join(spoken_parts))

        return narr

    pairs = {}
    narration_data = []

    source = []
    target = []
    U_ALL = OrderedDict()

    for item in labels.video_id.values:
        if item not in pairs:
            pairs[item] = 0

    window_size = 3

    for k, v in pairs.items():
        video_data = labels[labels['video_id'] == k].sort_values(by="start_timestamp")
        count = 0

        nouns = video_data.noun.to_list()
        verbs = video_data.verb.to_list()
        narrations = video_data.narration.to_list()

        bad_words = ["still", "continue"]

        for i, (noun, verb, narration) in enumerate(zip(nouns, verbs, narrations)):

            if i < len(nouns) - window_size:
                bag = []
                sack = [tokenize_multiword(noun, verb, narration + " . ")]
                narration_composition_sack = [verb + "|" + noun]
                narration_idx = [str(i)]
                skip = False

                for j in range(window_size):
                    bag.append(nouns[i + j + 1])
                    sack.append(tokenize_multiword(nouns[i+j+1], verbs[i+j+1], narrations[i + j + 1]) + " . ")
                    narration_composition_sack.append("+" + verbs[i + j + 1] + "|" + nouns[i + j + 1])
                    narration_idx.append(str(i + j + 1))

                if not len(sack) == len(set(sack)):  # skip repeating narrations
                    continue

                for w in bad_words:
                    if w in "".join(sack):
                        skip = True

                if skip:
                    continue

                key = bag.pop()
                bag.append(noun)

                if key in bag:
                    count = count + 1
                    narration_text_tagged = "".join(sack)

                    narration_text_tagged_new = []
                    for item in narration_text_tagged.split(" "):
                        if item in nouns:
                            item = item + "[N" + str(nouns.index(item)) + "]"
                        elif item in verbs:
                            item = item + "[V" + str(verbs.index(item)) + "]"
                        narration_text_tagged_new.append(item)

                    narration_text_tagged_text = " ".join(narration_text_tagged_new)
                    narration_composition_keys = "".join(
                        narration_composition_sack)  # TADA: We have the whole compositions for any narration!!!
                    narration_data.append(narration_text_tagged_text)
                    trg = sack.pop()  # + " . "
                    src = "".join(sack)  # + " . "
                    source.append(src)
                    target.append(trg)

                    U_id = k + "+" + "_".join(
                        narration_idx)  # in the format of video_id + '+' + narration_index seperated by '_'
                    U_ALL[U_id] = narration_composition_keys

        pairs[k] = count

    verb_noun_compositions = {}
    atom_distribution = OrderedDict()
    compound_distribution = OrderedDict()

    all_dict = {}

    # Tally verb/noun statistics over every composition of every sample.
    # Each U_ALL value is a "+"-joined sequence of "verb|noun" compositions.
    for composition_keys in U_ALL.values():

        for composition in composition_keys.split("+"):

            verb, noun = composition.split("|")

            # verb -> noun -> count. Every occurrence is counted, including
            # the first one seen for a verb (previously that occurrence only
            # created the empty inner dict and was left out of the counts).
            noun_counts = all_dict.setdefault(verb, {})
            noun_counts[noun] = noun_counts.get(noun, 0) + 1

            verb_noun_compositions[composition] = verb_noun_compositions.get(composition, 0) + 1

            # ATOM DISTRIBUTION (noun is registered before verb, which fixes
            # the key order of the ordered table)
            atom_distribution[noun] = atom_distribution.get(noun, 0) + 1
            atom_distribution[verb] = atom_distribution.get(verb, 0) + 1

            # COMPOUND DISTRIBUTION
            compound_distribution[composition] = compound_distribution.get(composition, 0) + 1

    save_json(all_dict, "all_compositions_newest.json")

    """**Greedy Fast Approach**"""

    # def zero_division(n, d):
    #     return n / d if d else 0

    def chernoff_fast(hist1, hist2, alfa):
        """
        Chernoff coefficient of two histograms given as aligned count sequences.

        Each histogram is normalised by its own total and the coefficient
        C_alfa(P || Q) = sum_k p_k^alfa * q_k^(1 - alfa) (Chung et al., 1989)
        is accumulated over the paired bins. Bins are paired with ``zip``,
        so pairing stops at the shorter of the two inputs.

        Args:
            hist1: Counts of the first histogram.
            hist2: Counts of the second histogram.
            alfa: Exponent applied to the first distribution; the second
                one is raised to ``1 - alfa``.

        Returns:
            The accumulated coefficient as a float (0.0 for empty inputs).
        """
        epsilon = 1e-32  # keeps the normalisers non-zero for all-zero histograms
        norm_p = sum(hist1) + epsilon
        norm_q = sum(hist2) + epsilon
        exponent_q = 1 - alfa

        coefficient = 0.0
        for count_p, count_q in zip(hist1, hist2):
            coefficient += (count_p / norm_p) ** alfa * (count_q / norm_q) ** exponent_q

        return coefficient

    # def get_compound_freq(L, RL):
    #     cf = [0] * len(RL)
    #
    #     S = set(L)
    #     # RL = list(R.keys())
    #
    #     for s in S:
    #         cf[RL.index(s)] = L.count(s)
    #
    #     return cf

    def get_compound_freq_tabled(CF, compounds):
        """Add one to the count in ``CF`` for every entry of ``compounds``.

        ``CF`` is updated in place and also returned. Every compound must
        already be a key of ``CF``; a missing key raises ``KeyError``.
        """
        for key in compounds:
            CF[key] = CF[key] + 1
        return CF

    def get_compound_freq_tabled_remove(CF, compounds):
        """Subtract one from the count in ``CF`` for every entry of ``compounds``.

        ``CF`` is updated in place and also returned. Every compound must
        already be a key of ``CF``; a missing key raises ``KeyError``.
        """
        for key in compounds:
            CF[key] = CF[key] - 1
        return CF

    # def get_atom_freq(L, RL):
    #     atoms = []  # [0] * len(R)
    #
    #     for compound in L:
    #         atoms.extend(compound.split("|"))
    #
    #     af = [0] * len(RL)
    #
    #     S = set(atoms)
    #
    #     for s in S:
    #         af[RL.index(s)] = atoms.count(s)
    #
    #     return af

    def get_atom_freq_tabled(AF, atoms):
        """Add one to the count in ``AF`` for every entry of ``atoms``.

        ``AF`` is updated in place and also returned. Every atom must
        already be a key of ``AF``; a missing key raises ``KeyError``.
        """
        for key in atoms:
            AF[key] = AF[key] + 1
        return AF

    def get_atom_freq_tabled_remove(AF, atoms):
        """Subtract one from the count in ``AF`` for every entry of ``atoms``.

        ``AF`` is updated in place and also returned. Every atom must
        already be a key of ``AF``; a missing key raises ``KeyError``.
        """
        for key in atoms:
            AF[key] = AF[key] - 1
        return AF

    # def get_divergence(V, W, ADL, CDL, atom_divergence, compound_divergence):
    #     V_flat = []
    #     W_flat = []
    #
    #     for item in V:
    #         V_flat.extend(item.split("+"))
    #
    #     for item in W:
    #         W_flat.extend(item.split("+"))
    #
    #     FC_V = get_compound_freq(V_flat, CDL)
    #     FC_W = get_compound_freq(W_flat, CDL)
    #     FA_V = get_atom_freq(V_flat, ADL)
    #     FA_W = get_atom_freq(W_flat, ADL)
    #
    #     # DC(V∥W)=1 − C0.1(FC(V)∥FC(W))
    #     # DA(V∥W)=1 − C0.5(FA(V)∥FA(W))
    #     # According to chernoff coeff.
    #     # C_α(P ∥ Q) = Σ_k p_k^α · q_k^(1−α) ∈ [0, 1]
    #     chernoff_coef_C = chernoff_fast(FC_V, FC_W, compound_divergence)
    #     chernoff_coef_A = chernoff_fast(FA_V, FA_W, atom_divergence)
    #     DC_VW = 1.0 - chernoff_coef_C
    #     DA_VW = 1.0 - chernoff_coef_A
    #
    #     return DA_VW, FA_V, FA_W, DC_VW, FC_V, FC_W
    #
    # def get_divergence_what_if_atom_only(V_flat, W_flat, ADL, atom_divergence):
    #     FA_V = get_atom_freq(V_flat, ADL)
    #     FA_W = get_atom_freq(W_flat, ADL)
    #
    #     # DA(V∥W)=1 − C0.5(FA(V)∥FA(W))
    #     # According to chernoff coeff.
    #     # C_α(P ∥ Q) = Σ_k p_k^α · q_k^(1−α) ∈ [0, 1]
    #     chernoff_coef_A = chernoff_fast(FA_V, FA_W, atom_divergence)
    #     DA_VW = 1.0 - chernoff_coef_A
    #
    #     return DA_VW, FA_V, FA_W

    def get_divergence_what_if_atom_only_tabled(S, AF_table_V, AF_table_W, atoms, atom_divergence):
        """Atom divergence between V and W after adding ``atoms`` to split ``S``.

        The table of the selected split ("V" or "W") is incremented in place
        for every atom; any other value of ``S`` leaves both tables as they
        are. The divergence is one minus the Chernoff coefficient of the two
        tables' values, taken in each table's key order.

        Args:
            S: Split receiving the atoms, "V" or "W".
            AF_table_V: Atom -> count table of split V.
            AF_table_W: Atom -> count table of split W.
            atoms: Atoms of the candidate sample.
            atom_divergence: Exponent passed to ``chernoff_fast``.

        Returns:
            Tuple ``(divergence, AF_table_V, AF_table_W)``.
        """
        if S == "V":
            AF_table_V = get_atom_freq_tabled(AF_table_V, atoms)
        elif S == "W":
            AF_table_W = get_atom_freq_tabled(AF_table_W, atoms)

        # D_A(V || W) = 1 - C_alpha(F_A(V) || F_A(W))
        counts_v = list(AF_table_V.values())
        counts_w = list(AF_table_W.values())
        similarity = chernoff_fast(counts_v, counts_w, atom_divergence)

        return 1.0 - similarity, AF_table_V, AF_table_W

    def get_divergence_what_if_compound_only_tabled(S, CF_table_V, CF_table_W, compounds, compound_divergence):
        """Compound divergence between V and W after adding ``compounds`` to split ``S``.

        The table of the selected split ("V" or "W") is incremented in place
        for every compound; any other value of ``S`` leaves both tables as
        they are. The divergence is one minus the Chernoff coefficient of the
        two tables' values, taken in each table's key order.

        Args:
            S: Split receiving the compounds, "V" or "W".
            CF_table_V: Compound -> count table of split V.
            CF_table_W: Compound -> count table of split W.
            compounds: Compounds of the candidate sample.
            compound_divergence: Exponent passed to ``chernoff_fast``.

        Returns:
            Tuple ``(divergence, CF_table_V, CF_table_W)``.
        """
        if S == "V":
            CF_table_V = get_compound_freq_tabled(CF_table_V, compounds)
        elif S == "W":
            CF_table_W = get_compound_freq_tabled(CF_table_W, compounds)

        # D_C(V || W) = 1 - C_alpha(F_C(V) || F_C(W))
        counts_v = list(CF_table_V.values())
        counts_w = list(CF_table_W.values())
        similarity = chernoff_fast(counts_v, counts_w, compound_divergence)

        return 1.0 - similarity, CF_table_V, CF_table_W

    # def get_divergence_what_if_compound_only(V_flat, W_flat, CDL, compound_divergence):
    #     FC_V = get_compound_freq(V_flat, CDL)
    #     FC_W = get_compound_freq(W_flat, CDL)
    #
    #     # DC(V∥W)=1 − C0.1(FC(V)∥FC(W))
    #     # According to chernoff coeff.
    #     # C_α(P ∥ Q) = Σ_k p_k^α · q_k^(1−α) ∈ [0, 1]
    #     chernoff_coef_C = chernoff_fast(FC_V, FC_W, compound_divergence)
    #     DC_VW = 1.0 - chernoff_coef_C
    #
    #     return DC_VW, FC_V, FC_W

    def greedy_fast(thr=0):
        # beginning of the greedy algorithm described in the paper

        AD = OrderedDict(atom_distribution)
        CD = OrderedDict(compound_distribution)

        AF_table_V = OrderedDict()
        CF_table_V = OrderedDict()
        AF_table_W = OrderedDict()
        CF_table_W = OrderedDict()

        for k,v in AD.items():
            AF_table_V[k] = 0
            AF_table_W[k] = 0

        for k,v in CD.items():
            CF_table_V[k] = 0
            CF_table_W[k] = 0

        # ADL = list(AD.keys())
        # CDL = list(CD.keys())
        U = list(U_ALL.values())

        atom_divergence = 0.5  # cf. Keysers et al 2020
        compound_divergence = 0.1  # cf. Keysers et al 2020

        # DC(V∥W)=1 − C0.1(FC(V)∥FC(W))
        # DA(V∥W)=1 − C0.5(FA(V)∥FA(W))

        # To construct such an experiment for a dataset U and a desired combination of atom and compound divergences,
        # we use an iterative greedy algorithm that starts with empty sets V (train) and W (test), and then alternates
        # between adding an example u ∈ U to V or W (while maintaining the desired train/test ratio).
        # At each iteration, the element u is selected such that DC (V ∥W ) and DA (V ∥W ) are kept as closely as
        # possible to the desired values.
        # To reduce the risk of being stuck in a local optimum, we also allow removing examples at certain iterations.

        quit = 0 # termination counter
        V = []  # train split
        W = []  # test split
        V_idx = []
        W_idx = []

        V_dict = OrderedDict()
        W_dict = OrderedDict()

        # Splits = {'V': V_dict, 'W': W_dict}

        # bins_C = [x for x in range(0, len(CD))]
        # bins_A = [x for x in range(0, len(AD))]
        U_small = list(U[0:])

        DA_VW = 0.0
        i = 0
        inx_U = [x for x in range(0, len(U_small))]

        atoms_dict_inx = OrderedDict()
        compounds_dict_inx = OrderedDict()

        for u_ind in inx_U:
            u = U[u_ind]

            compounds = u.split("+")
            compounds_dict_inx[u_ind] = compounds
            atom_list = []

            for compound in compounds:
                atom_list.extend(compound.split("|"))

            atoms_dict_inx[u_ind] = list(atom_list)

        # V = {}  # Train
        # W = {}  # Test
        # Until all samples u \in U are assigned to V or W:
        #   1. Pick which split S to add a sample to next.
        #   2. For each u \in U that wasn't assigned yet:
        #   2a. if S = V: V' = V \union u, W' = W, else V' = V, W' = W \union u
        #   2b. Compute potential atom divergence D_A(V' || W') and compound divergence D_C(V' || W')
        #   3. Select the u which is best (D_A and D_C close to target)

        def add_to_split(S, V, W, V_idx, W_idx, AF_table_V, AF_table_W, CF_table_V, CF_table_W, atoms, compounds, u, ind, inx_U):
            """Assign sample ``u`` (index ``ind``) to split ``S`` and update the bookkeeping.

            ``S == "V"`` selects the train split; any other value selects the
            test split W. The sample is appended to the split's list, which is
            then replaced by a de-duplicated copy; ``ind`` is appended to the
            split's index list; the split's atom and compound tables are
            incremented; and ``ind`` is removed from the unassigned pool
            ``inx_U`` (``ValueError`` if it is not in the pool).

            Returns:
                Tuple ``(V, W, V_idx, W_idx, AF_table_V, AF_table_W,
                CF_table_V, CF_table_W, inx_U)`` reflecting the new state.
            """
            goes_to_train = (S == "V")
            members = V if goes_to_train else W
            member_indices = V_idx if goes_to_train else W_idx

            members.append(u)
            member_indices.append(ind)
            unique_members = list(set(members))  # one entry per distinct sample string

            if goes_to_train:
                V = unique_members
                AF_table_V = get_atom_freq_tabled(AF_table_V, atoms)
                CF_table_V = get_compound_freq_tabled(CF_table_V, compounds)
            else:
                W = unique_members
                AF_table_W = get_atom_freq_tabled(AF_table_W, atoms)
                CF_table_W = get_compound_freq_tabled(CF_table_W, compounds)

            # The index is no longer available for assignment to another split.
            inx_U.remove(ind)

            return V, W, V_idx, W_idx, AF_table_V, AF_table_W, CF_table_V, CF_table_W, inx_U

        # def add_to_split_new(S, Splits, u, ind, inx_U):
        #
        #     # Splits[S]
        #
        #     if S == "V":
        #         V.append(u)  # train split
        #         V_idx.append(ind)
        #         V = list(set(V))
        #     else:  # W:
        #         W.append(u)  # test split
        #         W_idx.append(ind)
        #         W = list(set(W))
        #
        #     inx_U.remove(ind)  # remove what we've added so that we don't add it to another split!

        def remove_from_split(S, V, W, V_idx, W_idx, AF_table_V, AF_table_W, CF_table_V, CF_table_W, atoms, compounds, u, ind, inx_U):
            """Take sample ``u`` (index ``ind``) back out of split ``S``.

            ``S == "V"`` selects the train split; any other value selects the
            test split W. ``ind`` is removed from the split's index list, the
            split's atom and compound tables are decremented, and ``ind`` is
            appended to the unassigned pool ``inx_U`` again.

            The split's sample list holds each distinct string only once
            (``add_to_split`` de-duplicates it), while the index list holds
            every index. Several indices can therefore share one string, so
            ``u`` may already be absent from the list when a later index with
            the same string is removed. The list removal is guarded for that
            reason instead of raising ``ValueError``.

            NOTE(review): ``u`` is dropped from the list on the first removal
            even if other indices with the same string remain in the split.
            Confirm whether anything downstream reads ``V`` / ``W``.

            Returns:
                Tuple ``(V, W, V_idx, W_idx, AF_table_V, AF_table_W,
                CF_table_V, CF_table_W, inx_U)`` reflecting the new state.
            """
            if S == "V":
                if u in V:  # may already be gone, see docstring
                    V.remove(u)  # train split
                V_idx.remove(ind)
                V = list(set(V))
                AF_table_V = get_atom_freq_tabled_remove(AF_table_V, atoms)
                CF_table_V = get_compound_freq_tabled_remove(CF_table_V, compounds)
            else:  # W:
                if u in W:  # may already be gone, see docstring
                    W.remove(u)  # test split
                W_idx.remove(ind)
                W = list(set(W))
                AF_table_W = get_atom_freq_tabled_remove(AF_table_W, atoms)
                CF_table_W = get_compound_freq_tabled_remove(CF_table_W, compounds)

            inx_U.append(ind)  # the index is available for assignment again

            return V, W, V_idx, W_idx, AF_table_V, AF_table_W, CF_table_V, CF_table_W, inx_U


        # ===============================================================================================

        # read files from directory
        # idx, keys, src, trg (train/test/val)
        _train_idx = load_json("8500/train_idx.json")
        _test_idx = load_json("8500/test_idx.json")
        _val_idx = load_json("8500/val_idx.json")
        # _train_keys = json.loads("9000/train_keys.json")
        # _test_keys = json.loads("9000/test_keys.json")
        # _val_keys = json.loads("9000/val_keys.json")

        S = "V"

        for ind in _train_idx:
            u = U[ind]
            V, W, V_idx, W_idx, AF_table_V, AF_table_W, CF_table_V, CF_table_W, inx_U = add_to_split(S, V, W,
                                                                                                     V_idx, W_idx,
                                                                                                     AF_table_V,
                                                                                                     AF_table_W,
                                                                                                     CF_table_V,
                                                                                                     CF_table_W,
                                                                                                     atoms_dict_inx[
                                                                                                         ind],
                                                                                                     compounds_dict_inx[
                                                                                                         ind],
                                                                                                     u,
                                                                                                     ind,
                                                                                                     inx_U)
            i += 1
            print("%s Adding %s \t\t to %s \t Remaining: %s" % (i, u, S, len(inx_U)))

        S = "W"

        for ind in _test_idx:
            u = U[ind]
            V, W, V_idx, W_idx, AF_table_V, AF_table_W, CF_table_V, CF_table_W, inx_U = add_to_split(S, V, W,
                                                                                                     V_idx, W_idx,
                                                                                                     AF_table_V,
                                                                                                     AF_table_W,
                                                                                                     CF_table_V,
                                                                                                     CF_table_W,
                                                                                                     atoms_dict_inx[
                                                                                                         ind],
                                                                                                     compounds_dict_inx[
                                                                                                         ind],
                                                                                                     u,
                                                                                                     ind,
                                                                                                     inx_U)
            i += 1
            print("%s Adding %s \t\t to %s \t Remaining: %s" % (i, u, S, len(inx_U)))

        S = "W"

        for ind in _val_idx:
            u = U[ind]
            V, W, V_idx, W_idx, AF_table_V, AF_table_W, CF_table_V, CF_table_W, inx_U = add_to_split(S, V, W,
                                                                                                     V_idx, W_idx,
                                                                                                     AF_table_V,
                                                                                                     AF_table_W,
                                                                                                     CF_table_V,
                                                                                                     CF_table_W,
                                                                                                     atoms_dict_inx[
                                                                                                         ind],
                                                                                                     compounds_dict_inx[
                                                                                                         ind],
                                                                                                     u,
                                                                                                     ind,
                                                                                                     inx_U)
            i += 1
            print("%s Adding %s \t\t to %s \t Remaining: %s" % (i, u, S, len(inx_U)))

        # ===============================================================================================

        while len(inx_U) > 0:  # continue until no item left to allocate

            # STEP 1 : Determine which split S to add u
            probability = random.random()

            if probability > 0.5:  # add u to V
                S = "V"
            else:  # add u to W
                S = "W"

            if i == 0:  # if it is the first item pick randomly
                ind = random.choice(inx_U)
                u = U[ind]
                V, W, V_idx, W_idx, AF_table_V, AF_table_W, CF_table_V, CF_table_W, inx_U = add_to_split(S, V, W,
                                                                                                         V_idx, W_idx,
                                                                                                         AF_table_V,
                                                                                                         AF_table_W,
                                                                                                         CF_table_V,
                                                                                                         CF_table_W,
                                                                                                         atoms_dict_inx[ind],
                                                                                                         compounds_dict_inx[ind],
                                                                                                         u,
                                                                                                         ind,
                                                                                                         inx_U)
                i += 1
                print("%s Adding %s \t\t to %s \t Remaining: %s" % (i, u, S,len(inx_U)))
            else:

                # STEP 2. For each u \in U that wasn't assigned yet:

                divergence_scores_C = OrderedDict()
                divergence_scores_A = OrderedDict()

                for u_ind in inx_U:

                    # u_tmp = U[u_ind]
                    # V_tmp = list(V)
                    # W_tmp = list(W)

                    # ----------------
                    AF_table_V_tmp = OrderedDict(AF_table_V)
                    AF_table_W_tmp = OrderedDict(AF_table_W)

                    DA_VW, _AF_table_V, _AF_table_W = get_divergence_what_if_atom_only_tabled(S, AF_table_V_tmp,
                                                                                                        AF_table_W_tmp,
                                                                                                        atoms_dict_inx[u_ind],
                                                                                    atom_divergence)  # train split
                    
                    divergence_scores_A[u_ind] = DA_VW

                    '''

                    DA_VW, FA_V, FA_W, DC_VW, FC_V, FC_W = get_divergence_what_if(V_flat, W_flat, ADL, CDL,
                                                                                  atom_divergence,
                                                                                  compound_divergence)

                    divergence_scores_C[u_ind] = DC_VW  # {"DC_VW": DC_VW, "DA_VW": DA_VW}
                    divergence_scores_A[u_ind] = DA_VW
                    divergence_scores_AC_ratio[u_ind] = DA_VW / DC_VW
                    '''

                filtered_inx_U_by_A = [k for k,v in divergence_scores_A.items() if v <= 0.02]

                if len(filtered_inx_U_by_A) == 0:
                    best_u_ind_A = min(divergence_scores_A, key=divergence_scores_A.get)
                    filtered_inx_U_by_A.append(best_u_ind_A)
                # elif len(filtered_inx_U_by_A) > 10:
                #     filtered_inx_U_by_A = random.sample(filtered_inx_U_by_A, 10)
                    # print(filtered_inx_U_by_A, divergence_scores_A[filtered_inx_U_by_A[0]])
                # else:
                #print(len(filtered_inx_U_by_A))

                for u_ind in inx_U:

                    if u_ind in filtered_inx_U_by_A:
                        # u_tmp = U[u_ind]

                        CF_table_V_tmp = OrderedDict(CF_table_V)
                        CF_table_W_tmp = OrderedDict(CF_table_W)

                        DC_VW, _CF_table_V, _CF_table_W = get_divergence_what_if_compound_only_tabled(S,CF_table_V_tmp,
                                                                                                    CF_table_W_tmp,
                                                                                                    compounds_dict_inx[u_ind],
                                                                                                    compound_divergence)

                        divergence_scores_C[u_ind] = DC_VW
                    else:
                        divergence_scores_C[u_ind] = 0  # here we set compound divergence to zero for filtered items

                # At each iteration, the element u is selected such that
                # DC (V ∥W ) and DA (V ∥W ) are kept as closely as possible to the desired values.
                best_u_ind = max(divergence_scores_C, key=divergence_scores_C.get)
                u = U[best_u_ind]

                if divergence_scores_C[best_u_ind] <= 0.60:
                    quit += 1

                if quit == 2:
                    print("Terminating now...")
                    break

                V, W, V_idx, W_idx, AF_table_V, AF_table_W, CF_table_V, CF_table_W, inx_U = add_to_split(S, V, W,
                                                                                                         V_idx, W_idx,
                                                                                                         AF_table_V,
                                                                                                         AF_table_W,
                                                                                                         CF_table_V,
                                                                                                         CF_table_W,
                                                                                                         atoms_dict_inx[best_u_ind],
                                                                                                         compounds_dict_inx[best_u_ind],
                                                                                                         u,
                                                                                                         best_u_ind,
                                                                                                         inx_U)

                print("%s : %s - Adding %s \t to %s \t with A: %.4f \t with C: %.4f \
                \t Remaining: %s" % (i, len(filtered_inx_U_by_A), u, S,
                                                                  divergence_scores_A[best_u_ind],
                                                                  divergence_scores_C[best_u_ind],
                                                                  # DA_VW,
                                                                  # DC_VW,
                                                                  len(inx_U)))


                if i % 50 == 0:
                    if S == "V":
                        rnd_u_ind = random.choice(V_idx)
                        u = U[rnd_u_ind]

                    else:  # W:
                        rnd_u_ind = random.choice(W_idx)
                        u = U[rnd_u_ind]

                    V, W, V_idx, W_idx, AF_table_V, AF_table_W, CF_table_V,CF_table_W, inx_U = remove_from_split(S, V, W, V_idx, W_idx, AF_table_V, AF_table_W, CF_table_V,CF_table_W, atoms_dict_inx[rnd_u_ind], compounds_dict_inx[rnd_u_ind], u, rnd_u_ind, inx_U)
                    print("%s Removing %s \t from %s \t with indice: %s \t Remaining: %s" % (i, u, S, rnd_u_ind,len(inx_U)))

                # Plot the distribution of the train and test splits while adding the new data
                # if i % 50 == 0:
                
                #     plt.figure(figsize=(22,6))
                #     plt.grid(alpha=0.1, linestyle='--', linewidth=1)
                #     plt.hist([bins_A,bins_A], bins=bins_A, weights=[FA_V,FA_W], color=['blue', 'red'], alpha=0.5,
                #  label=['FA_V', 'FA_W'])
                #     plt.legend(loc='upper right')
                #     plt.show()
                
                #     plt.figure(figsize=(22,6))
                #     plt.grid(alpha=0.1, linestyle='--', linewidth=1)
                #     plt.hist([bins_C,bins_C], bins=bins_C, weights=[FC_V,FC_W], color=['green', 'orange'], alpha=0.5,
                #  label=['FC_V', 'FC_W'])
                #     plt.legend(loc='upper right')
                #     plt.show()

                # TODO: Based on Atomic divergence < 0.02
                # To reduce the risk of being stuck in a local optimum, we also allow removing examples at certain iterations.

            if thr > 0 and thr == i:
                break

            if i % 100 == 0 and i != 0:

                U_items = list(U_ALL.items())

                V_items = [U_items[i][0] for i in V_idx]
                W_items = [U_items[i][0] for i in W_idx]

                train_idx = list(V_idx)
                val_idx, test_idx = train_test_split(W_idx, test_size=0.50, random_state=42)

                train_keys = list(V_items)
                val_keys = [W_items[W_idx.index(item)] for item in val_idx]
                test_keys = [W_items[W_idx.index(item)] for item in test_idx]

                X_train = []
                Y_train = []
                X_test = []
                Y_test = []
                X_val = []
                Y_val = []

                for z in train_idx:
                    X_train.append(source[z])
                    Y_train.append(target[z])

                for z in val_idx:
                    X_val.append(source[z])
                    Y_val.append(target[z])

                for z in test_idx:
                    X_test.append(source[z])
                    Y_test.append(target[z])

                save_json(train_keys, "train_keys.json")
                save_json(val_keys, "val_keys.json")
                save_json(test_keys, "test_keys.json")

                save_json(train_idx, "train_idx.json")
                save_json(val_idx, "val_idx.json")
                save_json(test_idx, "test_idx.json")

                with open('train.src', 'w') as f:
                    for item in X_train:
                        f.write("%s\n" % item)

                with open('val.src', 'w') as f:
                    for item in X_val:
                        f.write("%s\n" % item)

                with open('test.src', 'w') as f:
                    for item in X_test:
                        f.write("%s\n" % item)

                with open('train.trg', 'w') as f:
                    for item in Y_train:
                        f.write("%s\n" % item)

                with open('val.trg', 'w') as f:
                    for item in Y_val:
                        f.write("%s\n" % item)

                with open('test.trg', 'w') as f:
                    for item in Y_test:
                        f.write("%s\n" % item)

                freqs = {"FA_V":list(AF_table_V.values()), "FA_W":list(AF_table_W.values()),
                         "FC_V":list(CF_table_V.values()), "FC_W":list(CF_table_W.values())}

                with open('frequencies.pkl', 'wb') as handle:
                    pickle.dump(freqs, handle, protocol=pickle.HIGHEST_PROTOCOL)

            if i % 500 == 0 and i != 0:

                os.mkdir(str(i))

                #V_items = [U[z][0] for z in V_idx]
                #W_items = [U[z][0] for z in W_idx]

                U_items = list(U_ALL.items())

                V_items = [U_items[i][0] for i in V_idx]
                W_items = [U_items[i][0] for i in W_idx]

                train_idx = list(V_idx)
                val_idx, test_idx = train_test_split(W_idx, test_size=0.50, random_state=42)

                train_keys = list(V_items)
                val_keys = [W_items[W_idx.index(item)] for item in val_idx]
                test_keys = [W_items[W_idx.index(item)] for item in test_idx]

                X_train = []
                Y_train = []
                X_test = []
                Y_test = []
                X_val = []
                Y_val = []

                for z in train_idx:
                    X_train.append(source[z])
                    Y_train.append(target[z])

                for z in val_idx:
                    X_val.append(source[z])
                    Y_val.append(target[z])

                for z in test_idx:
                    X_test.append(source[z])
                    Y_test.append(target[z])

                save_json(train_keys,os.path.join(str(i), "train_keys.json"))
                save_json(val_keys,os.path.join(str(i), "val_keys.json"))
                save_json(test_keys, os.path.join(str(i), "test_keys.json"))

                save_json(train_idx, os.path.join(str(i), "train_idx.json"))
                save_json(val_idx, os.path.join(str(i), "val_idx.json"))
                save_json(test_idx, os.path.join(str(i), "test_idx.json"))

                with open(os.path.join(str(i), 'train.src'), 'w') as f:
                    for item in X_train:
                        f.write("%s\n" % item)

                with open(os.path.join(str(i), 'val.src'), 'w') as f:
                    for item in X_val:
                        f.write("%s\n" % item)

                with open(os.path.join(str(i), 'test.src'), 'w') as f:
                    for item in X_test:
                        f.write("%s\n" % item)

                with open(os.path.join(str(i), 'train.trg'), 'w') as f:
                    for item in Y_train:
                        f.write("%s\n" % item)

                with open(os.path.join(str(i), 'val.trg'), 'w') as f:
                    for item in Y_val:
                        f.write("%s\n" % item)

                with open(os.path.join(str(i), 'test.trg'), 'w') as f:
                    for item in Y_test:
                        f.write("%s\n" % item)

                freqs = {"FA_V": list(AF_table_V.values()), "FA_W": list(AF_table_W.values()),
                         "FC_V": list(CF_table_V.values()), "FC_W": list(CF_table_W.values())}

                with open(os.path.join(str(i),'frequencies.pkl'), 'wb') as handle:
                    pickle.dump(freqs, handle, protocol=pickle.HIGHEST_PROTOCOL)

            i = i + 1

        return V, V_idx, W, W_idx, list(AF_table_V.values()), list(AF_table_W.values()), list(CF_table_V.values()), list(CF_table_W.values())

    V, V_idx, W, W_idx, FA_V, FA_W, FC_V, FC_W = greedy_fast(thr)

    U_items = list(U_ALL.items())

    V_items = [U_items[i][0] for i in V_idx]
    W_items = [U_items[i][0] for i in W_idx]

    """### Now generate the split based on the DBCA approach"""

    train_idx = list(V_idx)
    val_idx, test_idx = train_test_split(W_idx, test_size=0.50, random_state=42)

    train_keys = list(V_items)
    val_keys = [W_items[W_idx.index(item)] for item in val_idx]
    test_keys = [W_items[W_idx.index(item)] for item in test_idx]

    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    X_val = []
    Y_val = []

    for i in train_idx:
        X_train.append(source[i])
        Y_train.append(target[i])

    for i in val_idx:
        X_val.append(source[i])
        Y_val.append(target[i])

    for i in test_idx:
        X_test.append(source[i])
        Y_test.append(target[i])

    save_json(train_keys, "train_keys.json")
    save_json(val_keys, "val_keys.json")
    save_json(test_keys, "test_keys.json")

    save_json(train_idx, "train_idx.json")
    save_json(val_idx, "val_idx.json")
    save_json(test_idx, "test_idx.json")

    with open('train.src', 'w') as f:
        for item in X_train:
            f.write("%s\n" % item)

    with open('val.src', 'w') as f:
        for item in X_val:
            f.write("%s\n" % item)

    with open('test.src', 'w') as f:
        for item in X_test:
            f.write("%s\n" % item)

    with open('train.trg', 'w') as f:
        for item in Y_train:
            f.write("%s\n" % item)

    with open('val.trg', 'w') as f:
        for item in Y_val:
            f.write("%s\n" % item)

    with open('test.trg', 'w') as f:
        for item in Y_test:
            f.write("%s\n" % item)

    freqs = {"FA_V": FA_V, "FA_W": FA_W,
             "FC_V": FC_V, "FC_W": FC_W}

    with open('frequencies.pkl', 'wb') as handle:
        pickle.dump(freqs, handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == '__main__':
    # Script entry point: build the splits end to end. Passing thr=0 means the
    # early-exit check (`thr > 0 and thr == i`) never fires, so the greedy
    # search is not cut short by an iteration threshold.
    print("starting split generation...\n\n")
    generate(thr=0)
    print("completed!")