feat (model training): Added semi-supervised learning support.
alexcarnero97 committed Jul 5, 2024
1 parent 1cdb7f4 commit 1986387
Showing 34 changed files with 1,010 additions and 126 deletions.
45 changes: 44 additions & 1 deletion README.md
@@ -51,7 +51,9 @@ Kafka-ML article has been selected as
- [Usage](#usage)
- [Single models](#Single-models)
- [Distributed models](#Distributed-models)
- [Semi-supervised learning](#Semi-supervised-learning)
- [Incremental training](#Incremental-training)
- [Federated learning](#Federated-learning)
- [Installation and development](#Installation-and-development)
- [Requirements to build locally](#Requirements-to-build-locally)
- [Steps to build Kafka-ML](#Steps-to-build-Kafka-ML)
@@ -78,6 +80,7 @@ Kafka-ML article has been selected as
- [26/12/2022] Added indefinite incremental training support.
- [07/07/2023] Added federated training support (currently only for Tensorflow/Keras models).
- [28/09/2023] Federated learning enabled for distributed neural networks and incremental training.
- [05/07/2024] Added semi-supervised learning support.

## Deploy Kafka-ML in a fast way

@@ -493,6 +496,46 @@ the topics deployed:

```
python examples/MNIST_RAW_format/mnist_dataset_inference_example.py
```

### Semi-supervised learning

Semi-supervised learning is a type of machine learning that falls between supervised
and unsupervised learning. In supervised learning, the model is trained on a labeled
dataset, where each example is associated with a correct output or label. In unsupervised
learning, the model is trained on an unlabeled dataset, and it must learn to identify
patterns or structure in the data without any explicit guidance. Semi-supervised learning,
on the other hand, involves training a machine learning model on a dataset that contains
both labeled and unlabeled examples. The idea behind semi-supervised learning is to use
the small amount of labeled data to guide the learning process, while also leveraging
the much larger amount of unlabeled data to improve the model's performance.

Currently, TensorFlow is the only framework that supports semi-supervised training.
Usage is the same as presented for single models; only the deployment configuration
form changes, as it now contains additional fields.

As before, change the fields as desired. The new semi-supervised fields are
`unsupervised_rounds` and `confidence`. `unsupervised_rounds` defines the number of
rounds used to iterate over the still-unlabelled data. `confidence` specifies the
minimum confidence the model must have in its prediction for an unlabelled sample
before that predicted label is assigned to it. Both fields are optional; if omitted,
the default values _5_ and _0.9_ are used, respectively.

<img src="images/deploy-unsupervised-configuration.png" width="500">
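Under the hood this approach amounts to classic self-training (pseudo-labelling). The toy sketch below illustrates the idea only — it is not Kafka-ML's actual training code, and the nearest-centroid "model" and the helper names `fit_centroids`, `predict_proba`, and `self_training` are illustrative assumptions. It shows how the two deployment parameters drive the loop:

```python
import numpy as np

def fit_centroids(X, y):
    """Toy stand-in for a real model: one centroid per class."""
    return np.stack([X[y == c].mean(axis=0) for c in np.unique(y)])

def predict_proba(centroids, X):
    """Softmax over negative distances as a crude class probability."""
    d = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    e = np.exp(-d + d.min(axis=1, keepdims=True))  # shift for stability
    return e / e.sum(axis=1, keepdims=True)

def self_training(X_lab, y_lab, X_unlab, unsupervised_rounds=5, confidence=0.9):
    """Pseudo-labelling: adopt predictions above `confidence` for up to
    `unsupervised_rounds` passes over the remaining unlabelled pool."""
    X, y, pool = X_lab, y_lab, X_unlab
    for _ in range(unsupervised_rounds):
        if len(pool) == 0:
            break
        centroids = fit_centroids(X, y)
        proba = predict_proba(centroids, pool)
        accepted = proba.max(axis=1) >= confidence
        if not accepted.any():
            break  # no prediction is trusted enough; stop early
        # Move confidently predicted samples into the labelled set
        X = np.vstack([X, pool[accepted]])
        y = np.concatenate([y, proba[accepted].argmax(axis=1)])
        pool = pool[~accepted]
    return fit_centroids(X, y), pool
```

With the defaults (`unsupervised_rounds=5`, `confidence=0.9`), confidently predicted samples migrate from the unlabelled pool into the training set on each round, and any sample the model never trusts is simply left out of training.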

Once the configuration is deployed, you will see one training result per model
in the configuration. Models are now ready to be trained and receive stream
data. Now, it is time to ingest the model(s) with your data stream for training.

If you have used the MNIST model you can use the example
`mnist_dataset_unsupervised_training_example.py`. You may need to install the Python
libraries listed in datasources/requirements.txt.

If so, please execute the semi-supervised MNIST example for training:

```
python examples/MNIST_RAW_format/mnist_dataset_unsupervised_training_example.py
```
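The example script's job is to stream the dataset into the deployment's training topic, with only a fraction of the samples carrying labels. The sketch below is a hypothetical illustration of that pattern: the JSON encoding, the `-1` "unlabelled" marker, and the topic name are assumptions for illustration, not Kafka-ML's actual wire format (which the example script implements):

```python
import json

UNLABELLED = -1  # illustrative placeholder for samples without ground truth

def encode_sample(pixels, label=UNLABELLED):
    """Serialize one sample; label -1 means 'unlabelled' in this sketch."""
    return json.dumps({"data": list(pixels), "label": int(label)}).encode("utf-8")

def stream_dataset(send, images, labels, labelled_fraction=0.1):
    """Send a small labelled head and a large unlabelled tail to a sink."""
    n_labelled = int(len(images) * labelled_fraction)
    for i, img in enumerate(images):
        label = labels[i] if i < n_labelled else UNLABELLED
        send(encode_sample(img, label))

# With a running broker you would plug in a real producer, e.g.:
#   from kafka import KafkaProducer
#   producer = KafkaProducer(bootstrap_servers="localhost:9092")
#   stream_dataset(lambda m: producer.send("mnist-training", m), images, labels)
```

The training job can then apply pseudo-labelling to the unlabelled tail, guided by the `unsupervised_rounds` and `confidence` settings of the deployment.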

### Incremental training

Incremental training is a machine learning method in which input data is
@@ -523,7 +566,7 @@ in the configuration. Models are now ready to be trained and receive stream
data. Now, it is time to ingest the model(s) with your data stream for training.

If you have used the MNIST model you can use the example
-`mnist_dataset_federated_training_example.py`. You may need to install the Python
+`mnist_dataset_online_training_example.py`. You may need to install the Python
libraries listed in datasources/requirements.txt.

If so, please execute the incremental MNIST example for training:
28 changes: 19 additions & 9 deletions backend/automl/models.py
@@ -29,24 +29,34 @@ class Meta(object):
 class Deployment(models.Model):
     """Deployment of a configuration of models for training"""
 
+    # General Deployment Settings
+    batch = models.IntegerField(default=1)
+    tf_kwargs_fit = models.CharField(max_length=100, blank=True)
+    tf_kwargs_val = models.CharField(max_length=100, blank=True)
+    pth_kwargs_fit = models.CharField(max_length=100, blank=True)
+    pth_kwargs_val = models.CharField(max_length=100, blank=True)
+    conf_mat_settings = models.BooleanField(default=False, blank=True, null=True)
+    configuration = models.ForeignKey(Configuration, related_name='deployments', on_delete=models.CASCADE)
+    time = models.DateTimeField(default=now, editable=False)
+
+    # Distributed Deployment Settings
     optimizer = models.TextField(default='adam', blank=True)
     learning_rate = models.DecimalField(max_digits=7, decimal_places=6, default=0.001, blank=True)
     loss = models.TextField(default='sparse_categorical_crossentropy', blank=True)
     metrics = models.TextField(default='sparse_categorical_accuracy', blank=True)
 
+    # Incremental Deployment Settings
     incremental = models.BooleanField(default=False)
     indefinite = models.BooleanField(default=False)
     stream_timeout = models.IntegerField(default=60000, blank=True, null=True)
     monitoring_metric = models.TextField(blank=True, null=True)
     change = models.TextField(blank=True, null=True)
+    improvement = models.DecimalField(max_digits=7, decimal_places=6, blank=True, null=True, default=0.1)
-    batch = models.IntegerField(default=1)
-    tf_kwargs_fit = models.CharField(max_length=100, blank=True)
-    tf_kwargs_val = models.CharField(max_length=100, blank=True)
-    pth_kwargs_fit = models.CharField(max_length=100, blank=True)
-    pth_kwargs_val = models.CharField(max_length=100, blank=True)
-    conf_mat_settings = models.BooleanField(default=False, blank=True, null=True)
-    configuration = models.ForeignKey(Configuration, related_name='deployments', on_delete=models.CASCADE)
-    time = models.DateTimeField(default=now, editable=False)
-    improvement = models.DecimalField(max_digits=7, decimal_places=6, blank=True, null=True, default=0.05)
+
+    # Unsupervised Deployment Settings
+    unsupervised = models.BooleanField(default=False)
+    unsupervised_rounds = models.IntegerField(default=5, blank=True, null=True)
+    confidence = models.DecimalField(max_digits=7, decimal_places=6, blank=True, null=True, default=0.9)
 
+    # Federated Deployment Settings
     federated = models.BooleanField(default=False)
6 changes: 5 additions & 1 deletion backend/automl/serializers.py
@@ -100,6 +100,7 @@ class Meta:
         model = Deployment
         fields = ['optimizer', 'learning_rate', 'loss', 'metrics']+[
             'incremental', 'indefinite', 'stream_timeout', 'monitoring_metric', 'change', 'improvement']+[
+            'unsupervised', 'unsupervised_rounds', 'confidence']+[
             'batch', 'tf_kwargs_fit', 'tf_kwargs_val', 'pth_kwargs_fit', 'pth_kwargs_val', 'conf_mat_settings', 'configuration']+[
             'federated', 'agg_rounds', 'min_data', 'agg_strategy', 'data_restriction']
 
@@ -140,7 +141,10 @@ class DeploymentSerializer(serializers.ModelSerializer):
 
     class Meta:
         model = Deployment
-        fields = ['id', 'configuration', 'results', 'optimizer', 'learning_rate', 'loss', 'metrics', 'incremental', 'indefinite', 'stream_timeout', 'monitoring_metric', 'change', 'improvement']+[
+        fields = ['id', 'configuration', 'results']+[
+            'optimizer', 'learning_rate', 'loss', 'metrics']+[
+            'incremental', 'indefinite', 'stream_timeout', 'monitoring_metric', 'change', 'improvement']+[
+            'unsupervised', 'unsupervised_rounds', 'confidence']+[
             'batch', 'tf_kwargs_fit', 'tf_kwargs_val', 'pth_kwargs_fit', 'pth_kwargs_val', 'conf_mat_settings', 'time']+[
             'federated', 'agg_rounds', 'min_data', 'agg_strategy', 'data_restriction']
