Add files via upload

shtrausslearning · Jun 27, 2023 · de9a642 · de9a642
1 parent 22ffcb5
commit de9a642
Show file tree

Hide file tree

Showing 44 changed files with 3,087 additions and 1,469 deletions.
diff --git a/build/lib/mllibs/common_eval.py b/build/lib/mllibs/common_eval.py
diff --git a/build/lib/mllibs/interface.py b/build/lib/mllibs/interface.py
@@ -15,6 +15,7 @@
 from mllibs.mtextnorm import cleantext, configure_nlptxtclean
 from mllibs.msllinear import sllinear, configure_sllinear
 from mllibs.musldimred import usldimred, configure_usldimred
+from mllibs.mslensemble import slensemble, configure_slensemble
 
 # single command interpreter, multiple command interpreter & interface (chat)
 
@@ -64,13 +65,16 @@ def exec(self,command:str,args:dict=None):
 
 class interface(snlpi,mnlpi,nlpi):
 
-    def __init__(self):
+    def __init__(self,silent=False):
 
         # compile modules
         self.collection = self.prestart()
-        snlpi.__init__(self,self.collection) 
+        snlpi.__init__(self,self.collection)
+        if(silent is False):
+            nlpi.silent = False
+        else:
+            nlpi.silent = True 
 
-
     def __getitem__(self,command:str):
         self.exec(command,args=None)
 
@@ -88,8 +92,9 @@ def prestart(self):
                          data_outliers(configure_outliers), # create data outliers
                          embedding(configure_nlpembed),    # generate text embeddings
                          cleantext(configure_nlptxtclean), # clean text 
-                         sllinear(configure_sllinear),      # linear regression models                        
-                         usldimred(configure_usldimred)
+                         sllinear(configure_sllinear),      # linear machine learning models                        
+                         usldimred(configure_usldimred),     # unsupervised learning dimension reduction
+                         slensemble(configure_slensemble)   # ensemble machine learning models
                         ])
 
 
@@ -98,38 +103,38 @@ def prestart(self):
         return collection
 
 
-    def iter_loop(self):
+    # def iter_loop(self):
 
-        # user command 
-        if(command == None):
-            print('What would you like to do?')
-            self.command = input()
-        else:
-            self.command = command
+    #     # user command 
+    #     if(command == None):
+    #         print('What would you like to do?')
+    #         self.command = input()
+    #     else:
+    #         self.command = command
 
-        ''' Check for multicommand '''
-        # currently simple implementation based on rules
+    #     ''' Check for multicommand '''
+    #     # currently simple implementation based on rules
 
-        tokens = nlpi.nltk_tokeniser(self.command)
+    #     tokens = nlpi.nltk_tokeniser(self.command)
 
-        for token in tokens:
-            if(token in text_store.dividers):
-                ctype = 'multiple'
-            else:
-                ctype = 'single'
+    #     for token in tokens:
+    #         if(token in text_store.dividers):
+    #             ctype = 'multiple'
+    #         else:
+    #             ctype = 'single'
 
-        # activate relevant interpreter
-        if(ctype == 'multiple'):
-            mnpli.__init__(self,self.collection)
-            self.exec(str(self.command))
-        elif(ctype == 'single'):
-            snlpi.__init__(self,self.collection)
-            self.exec(str(self.command))
-            self.return_data()
+    #     # activate relevant interpreter
+    #     if(ctype == 'multiple'):
+    #         mnpli.__init__(self,self.collection)
+    #         self.exec(str(self.command))
+    #     elif(ctype == 'single'):
+    #         snlpi.__init__(self,self.collection)
+    #         self.exec(str(self.command))
+    #         self.return_data()
 
 
-    def return_data(self):
-        print('storing data in global variable: stored')
-        globals()['stored'] = self.glr()
+    # def return_data(self):
+    #     print('storing data in global variable: stored')
+    #     globals()['stored'] = self.glr()
 
 
diff --git a/build/lib/mllibs/mdsplit.py b/build/lib/mllibs/mdsplit.py
@@ -4,6 +4,7 @@
 from sklearn.model_selection import KFold
 from sklearn.model_selection import StratifiedKFold
 from sklearn.model_selection import train_test_split
+import random
 
 
 '''
@@ -20,12 +21,32 @@ class make_fold(nlpi):
     def __init__(self,nlp_config):
         self.name = 'make_folds'             
         self.nlp_config = nlp_config 
+
+    @staticmethod
+    def sfp(args,preset,key:str):
+
+        if(args[key] is not None):
+            return eval(args[key])
+        else:
+            return preset[key] 
+
+    # set general parameter: evaluate args[key] when given, otherwise return None
+
+    @staticmethod
+    def sgp(args,key:str):
+
+        if(args[key] is not None):
+            return eval(args[key])
+        else:
+            return None
 
     # called in nlpi
     def sel(self,args:dict):
 
+        # define instance parameters
         self.select = args['pred_task']
         self.args = args
+        self.data_name = args['data_name']  # name of the data
 
         if(self.select == 'kfold_label'):
             self.kfold_label(self.args)
@@ -34,44 +55,64 @@ def sel(self,args:dict):
         elif(self.select == 'tts_label'):
             self.tts_label(self.args)
 
-    # Kfold splitting
+    ''' 
+    
+    ACTIVATION FUNCTIONS 
+    
+    '''
+    # kfold_label 
+    # skfold_label
+    # tts_label
 
     def kfold_label(self,args:dict):
+
+        pre = {'splits':3,'shuffle':True,'rs':random.randint(1,500)}
 
-        kf = KFold(n_splits=eval(args['splits']), 
-                   shuffle=eval(args['shuffle']), 
-                   random_state=eval(args['rs']))
+        kf = KFold(n_splits=self.sfp(args,pre,'n_splits'), 
+                   shuffle=self.sfp(args,pre,'shuffle'), 
+                   random_state=self.sfp(args,pre,'rs'))
 
         for i, (_, v_ind) in enumerate(kf.split(args['data'])):
             args['data'].loc[args['data'].index[v_ind], 'kfold'] = f"fold{i+1}"
 
         # store relevant data about operation
+
         nlpi.memory_output.append({'data':args['data'],
-                                   'shuffle':args['shuffle'],
-                                   'n_splits':args['splits'],
+                                   'shuffle':self.sfp(args,pre,'shuffle'),
+                                   'n_splits':self.sfp(args,pre,'splits'),
                                    'split':kf,
-                                   'rs':args['rs']}) 
-
+                                   'rs':self.sfp(args,pre,'rs')}) 
+
+        # store the KFold splitter object under the input data's 'splits' entry
+
+        nlpi.data[self.data_name[0]]['splits'][f'kfold_{nlpi.iter}'] = kf
+
     # Stratified kfold splitting             
 
     def skfold_label(self,args:dict):
+
+        pre = {'splits':3,'shuffle':True,'rs':random.randint(1,500)}
 
         if(type(args['y']) is str):
 
-            kf = StratifiedKFold(n_splits=eval(args['splits']), 
-                                 shuffle=eval(args['shuffle']), 
-                                 random_state=eval(args['rs']))
+            kf = StratifiedKFold(n_splits=self.sfp(args,pre,'n_splits'), 
+                                 shuffle=self.sfp(args,pre,'shuffle'), 
+                                 random_state=self.sfp(args,pre,'rs'))
 
             for i, (_, v_ind) in enumerate(kf.split(args['data'],args['data'][[args['y']]])):
                 args['data'].loc[args['data'].index[v_ind], 'skfold'] = f"fold{i+1}"
 
             # store relevant data about operation
             nlpi.memory_output.append({'data':args['data'],
-                                       'shuffle':args['shuffle'],
-                                       'n_splits':args['splits'],
+                                       'shuffle':self.sfp(args,pre,'shuffle'),
+                                       'n_splits':self.sfp(args,pre,'splits'),
                                        'stratify':args['y'],
                                        'split':kf,
-                                       'rs':args['rs']}) 
+                                       'rs':self.sfp(args,pre,'rs')}) 
+
+            # store the splitter object under the input data's 'splits' entry
+            nlpi.data[self.data_name[0]]['splits'][f'skfold_{nlpi.iter}'] = kf
+
         else:
             print('specify y data token for stratification!')    
             nlpi.memory_output(None)                           
@@ -80,12 +121,15 @@ def skfold_label(self,args:dict):
     # Train test split labeling (one df only)
 
     def tts_label(self,args:dict):
+
+        # preset setting 
+        pre = {'test_size':0.3,'shuffle':True,'rs':random.randint(1,500)}
 
         train, test = train_test_split(args['data'],
-                                       test_size=eval(args['test_size']),
-                                       shuffle=eval(args['shuffle']),
+                                       test_size=self.sfp(args,pre,'test_size'),
+                                       shuffle=self.sfp(args,pre,'shuffle'),
                                        stratify=args['y'],
-                                       random_state=eval(args['rs'])
+                                       random_state=self.sfp(args,pre,'rs')
                                        )
 
         train['tts'] = 'train'
@@ -95,68 +139,75 @@ def tts_label(self,args:dict):
 
         # store relevant data about operation
         nlpi.memory_output.append({'data':ldf,
-                                   'stratified':args['y'],
-                                   'shuffle':args['shuffle'],
+                                   'shuffle':self.sfp(args,pre,'shuffle'),
                                    'stratify':args['y'],
-                                   'test_size':args['test_size'],
-                                   'rs':args['rs']}
-                                   )
+                                   'test_size':self.sfp(args,pre,'test_size'),
+                                   'rs':self.sfp(args,pre,'rs')}
+                                )
+
+        # store the 'tts' label column under the input data's 'splits' entry
+        nlpi.data[self.data_name[0]]['splits'][f'tts_{nlpi.iter}'] = ldf['tts']
 
 '''
 
-
 Corpus
 
-
 '''   
 
 corpus_makefold = OrderedDict({"kfold_label":['create kfold',
-                                      'make kfold'
-                                      'create subset folds',
-                                      'make subset fold',
-                                      'label kfold'],
+                                              'create kfolds',
+                                              'make kfold',
+                                              'create kfold labels',
+                                              'create subset folds',
+                                              'make subset fold',
+                                              'label kfold'],
 
                                 "skfold_label": ['stratified kfold',
-                                            'create stratified kfold',
-                                            'make stratified kfold',
-                                            'generate stratified kfold',
-                                            'label statified kfold'],
+                                                 'stratified kfolds',
+                                                 'create stratified kfold',
+                                                 'make stratified kfold',
+                                                 'generate stratified kfold',
+                                                 'label statified kfold'],
 
                                 'tts_label': ['train test split label',
-                                             'create tts label',
-                                             'make tts label',
-                                             'make train test split label',
-                                             'train-test-split label',
-                                             'create train-test-split label',
-                                             'label tts',
-                                             'tts labels',
-                                             'create tts labels']
+                                              'train test split labels',
+                                              'train test splitting labels',
+                                              'create tts label',
+                                              'make tts label',
+                                              'make train test split label',
+                                              'train-test-split label',
+                                              'create train-test-split label',
+                                              'label tts',
+                                              'tts labels',
+                                              'create tts labels']
 
                                       })
 
 
 info_makefold = {'kfold_label': {'module':'make_folds',
-                            'action':'action',
-                            'topic':'topic',
-                            'subtopic':'sub topic',
-                            'input_format':'pd.DataFrame',
-                            'description':'generate kfolds labels for dataframe',
-                            'arg_compat':'splits shuffle rs'},
+                                'action':'create subset',
+                                'topic':'subset generation',
+                                'subtopic':'kfold cross validation',
+                                'input_format':'pd.DataFrame',
+                                'description':"K-fold cross-validation is a technique used in machine learning to evaluate the performance of a model. It involves dividing the dataset into k equal-sized subsets, or folds. The model is then trained on k-1 folds and tested on the remaining fold. This process is repeated k times, with each fold being used as the test set once. The results are averaged across the k iterations to provide an estimate of the model's performance. K-fold cross-validation helps to reduce the risk of overfitting and provides a more accurate estimate of the model's generalization performance. It is commonly used in machine learning to tune hyperparameters, select models, and compare different algorithms.",
+                                'arg_compat':'splits shuffle rs'},
 
                 'skfold_label': {'module':'make_folds',
-                            'action':'action',
-                            'topic':'topic',
-                            'subtopic':'sub topic',
-                            'input_format':'pd.DataFrame',
-                            'description':'generate stratified kfolds labels for dataframe'},
+                                'action':'create subset',
+                                'topic':'subset generation',
+                                'subtopic':'stratified kfold cross validation',
+                                'input_format':'pd.DataFrame',
+                                'description':"Stratified k-fold cross-validation is a variation of k-fold cross-validation that ensures that each fold is representative of the overall distribution of the target variable. This is particularly useful when dealing with imbalanced datasets, where one class may be significantly underrepresented. In stratified k-fold cross-validation, the dataset is divided into k folds, but the division is done in such a way that each fold contains approximately the same proportion of samples from each class as the original dataset. This ensures that each fold is representative of the overall distribution of the target variable, and reduces the risk of bias in the evaluation of the model's performance. Stratified k-fold cross-validation is commonly used in classification tasks where the goal is to predict the class label of a sample based on its features.",
+                                'arg_compat':'splits shuffle rs'
+                                },
 
                 'tts_label': {'module':'make_folds',
-                            'action':'action',
-                            'topic':'topic',
-                            'subtopic':'sub topic',
-                            'input_format':'pd.DataFrame',
-                            'description':'generate train-test-split labels for dataframe'}
-
+                              'action':'create subset',
+                              'topic':'subset generation',
+                              'subtopic':'train test split',
+                              'input_format':'pd.DataFrame',
+                              'description':"Train test splitting is a technique used in machine learning to evaluate the performance of a model. It involves dividing the available dataset into two subsets: the training set and the testing set. The training set is used to train the model, while the testing set is used to evaluate its performance. The idea behind train test splitting is to assess how well the model generalizes to new, unseen data. By evaluating the model on a separate testing set, we can get an estimate of its performance on new data that it has not seen before. The size of the training and testing sets can vary depending on the size of the dataset, but a common practice is to use 70-80 of the data for training and the remaining 20-30 for testing.",
+                              'arg_compat':'test_size shuffle rs'}
 
                             }