KolosalAI
diff --git a/‎framework/__init__.py‎
Lines changed: 32 additions & 6 deletions b/‎framework/__init__.py‎
Lines changed: 32 additions & 6 deletions
diff --git a/‎framework/adapters/model_adapters.py‎
Lines changed: 80 additions & 14 deletions b/‎framework/adapters/model_adapters.py‎
Lines changed: 80 additions & 14 deletions
diff --git a/‎framework/core/base_model.py‎
Lines changed: 58 additions & 10 deletions b/‎framework/core/base_model.py‎
Lines changed: 58 additions & 10 deletions
@@ -67,7 +67,7 @@ def __init__(self, config: Optional[InferenceConfig] = None):
         self.config = config
         self.model: Optional[BaseModel] = None
         self.engine: Optional[InferenceEngine] = None
-        self.model_manager = get_model_manager()
+        self.model_manager = get_model_manager  # Store the function, not call it
         self.performance_monitor = get_performance_monitor()
         self.metrics_collector = get_metrics_collector()
 
@@ -108,7 +108,7 @@ def load_model(self, model_path: Union[str, Path], model_name: Optional[str] = N
             if model_name is None:
                 model_name = Path(model_path).stem if isinstance(model_path, (str, Path)) else str(model_path)
 
-            self.model_manager.register_model(model_name, self.model)
+            self.model_manager().register_model(model_name, self.model)
 
             # Create inference engine
             self.engine = create_inference_engine(self.model, self.config)
@@ -153,7 +153,16 @@ def predict(self, inputs: Any, **kwargs) -> Any:
         if not self._initialized:
             raise RuntimeError("Model not loaded. Call load_model() first.")
 
-        return self.model.predict(inputs)
+        # Track performance
+        request_id = f"sync_{int(time.time() * 1000000)}"
+        self.performance_monitor.start_request(request_id)
+        try:
+            result = self.model.predict(inputs)
+            self.performance_monitor.end_request(request_id)
+            return result
+        except Exception as e:
+            self.performance_monitor.end_request(request_id)
+            raise
 
     async def predict_async(self, inputs: Any, priority: int = 0, 
                            timeout: Optional[float] = None, **kwargs) -> Any:
@@ -191,7 +200,24 @@ def predict_batch(self, inputs_list: List[Any], **kwargs) -> List[Any]:
         if not self._initialized:
             raise RuntimeError("Model not loaded. Call load_model() first.")
 
-        return self.model.predict_batch(inputs_list)
+        # Use the model's predict_batch method if available
+        if hasattr(self.model, 'predict_batch'):
+            return self.model.predict_batch(inputs_list)
+        
+        # Fallback to individual predictions
+        results = []
+        for i, inputs in enumerate(inputs_list):
+            request_id = f"batch_{int(time.time() * 1000000)}_{i}"
+            self.performance_monitor.start_request(request_id)
+            try:
+                result = self.model.predict(inputs)
+                results.append(result)
+                self.performance_monitor.end_request(request_id)
+            except Exception as e:
+                self.performance_monitor.end_request(request_id)
+                raise
+        
+        return results
 
     async def predict_batch_async(self, inputs_list: List[Any], priority: int = 0,
                                  timeout: Optional[float] = None, **kwargs) -> List[Any]:
@@ -343,7 +369,7 @@ async def cleanup(self) -> None:
         if self.model:
             self.model.cleanup()
 
-        self.model_manager.cleanup_all()
+        self.model_manager().cleanup_all()
 
         self.logger.info("Framework cleanup complete")
 
@@ -557,7 +583,7 @@ def load_model(self, model_path: Union[str, Path], model_name: Optional[str] = N
             if model_name is None:
                 model_name = Path(model_path).stem if isinstance(model_path, (str, Path)) else str(model_path)
 
-            self.model_manager.register_model(model_name, self.model)
+            self.model_manager().register_model(model_name, self.model)
 
             # Create inference engine
             self.engine = create_inference_engine(self.model, self.config)
 
@@ -38,7 +38,7 @@ def load_model(self, model_path: Union[str, Path]) -> None:
 
             # Load model
             if model_path.suffix == '.pt' or model_path.suffix == '.pth':
-                checkpoint = torch.load(model_path, map_location=self.device)
+                checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)
 
                 # Handle different save formats
                 if isinstance(checkpoint, nn.Module):
@@ -121,8 +121,57 @@ def postprocess(self, outputs: torch.Tensor) -> Any:
             self._postprocessing_pipeline = create_default_postprocessing_pipeline(self.config)
 
         result = self._postprocessing_pipeline.auto_postprocess(outputs)
+        
+        # Convert to dict for backward compatibility
+        if hasattr(result, 'to_dict'):
+            return result.to_dict()
+        
         return result
 
+    def predict_batch(self, inputs_list: List[Any]) -> List[Any]:
+        """
+        Batch prediction optimized for PyTorch models.
+        
+        Args:
+            inputs_list: List of input data
+            
+        Returns:
+            List of predictions
+        """
+        if not inputs_list:
+            return []
+        
+        # Try to batch process if possible
+        try:
+            # Preprocess all inputs
+            preprocessed_inputs = [self.preprocess(inp) for inp in inputs_list]
+            
+            # Stack into batch tensor if possible
+            if all(isinstance(inp, torch.Tensor) and inp.shape == preprocessed_inputs[0].shape for inp in preprocessed_inputs):
+                batch_tensor = torch.stack(preprocessed_inputs, dim=0)
+                
+                # Forward pass on batch
+                with torch.no_grad():
+                    batch_outputs = self.forward(batch_tensor)
+                
+                # Split batch results and postprocess
+                if len(batch_outputs.shape) > 0:
+                    outputs_list = torch.split(batch_outputs, 1, dim=0)
+                    results = []
+                    for output in outputs_list:
+                        output = output.squeeze(0)  # Remove batch dimension
+                        result = self.postprocess(output)
+                        results.append(result)
+                    return results
+                    
+            # Fallback to individual processing
+            return [self.predict(inp) for inp in inputs_list]
+            
+        except Exception as e:
+            self.logger.warning(f"Batch processing failed: {e}, falling back to individual processing")
+            # Fallback to individual processing
+            return [self.predict(inp) for inp in inputs_list]
+    
     def _get_input_shape(self) -> Tuple[int, ...]:
         """Get model input shape."""
         try:
@@ -482,8 +531,9 @@ def create_adapter(model_path: Union[str, Path], config: InferenceConfig) -> Bas
             return ONNXModelAdapter(config)
         elif model_path.suffix in ['.trt', '.engine']:
             return TensorRTModelAdapter(config)
-        elif '/' in str(model_path) and not model_path.exists():
-            # Likely a Hugging Face model name
+        elif ('/' in str(model_path) and not model_path.exists()) or \
+             (not model_path.exists() and not model_path.suffix and '-' in str(model_path)):
+            # Likely a Hugging Face model name (contains '/' or has no extension with '-')
             return HuggingFaceModelAdapter(config)
         else:
             # Default to PyTorch
@@ -510,21 +560,37 @@ def load_model(model_path: Union[str, Path], config: Optional[InferenceConfig] =
         
     Returns:
         Loaded model adapter
+        
+    Raises:
+        ValueError: If model format is not supported
     """
     if config is None:
         from ..core.config import get_global_config
         config = get_global_config()
 
-    # Create adapter
-    adapter = ModelAdapterFactory.create_adapter(model_path, config)
-    
-    # Load model
-    adapter.load_model(model_path)
-    
-    # Optimize for inference
-    adapter.optimize_for_inference()
+    model_path = Path(model_path) if isinstance(model_path, str) else model_path
 
-    # Warmup
-    adapter.warmup()
+    # Validate model format before proceeding
+    if model_path.exists() and model_path.suffix not in ['.pt', '.pth', '.torchscript', '.onnx', '.trt', '.engine']:
+        raise ValueError(f"Unsupported model format: {model_path.suffix}")
 
-    return adapter
+    # Create adapter
+    try:
+        adapter = ModelAdapterFactory.create_adapter(model_path, config)
+        
+        # Load model
+        adapter.load_model(model_path)
+        
+        # Optimize for inference
+        adapter.optimize_for_inference()
+        
+        # Warmup
+        adapter.warmup()
+        
+        return adapter
+    except ModelLoadError as e:
+        # Convert ModelLoadError to ValueError for unsupported formats
+        if "Unsupported file extension" in str(e):
+            raise ValueError(f"Unsupported model format: {model_path}") from e
+        else:
+            raise
@@ -138,11 +138,25 @@ def predict(self, inputs: Any) -> Any:
 
             # Forward pass
             with torch.no_grad():
-                raw_outputs = self.forward(preprocessed_inputs)
+                try:
+                    raw_outputs = self.forward(preprocessed_inputs)
+                except Exception as e:
+                    # Handle compilation errors by falling back to non-compiled model
+                    if "CppCompileError" in str(e) and self._compiled_model is not None:
+                        self.logger.warning("Torch compilation failed, falling back to non-compiled model")
+                        self.config.device.use_torch_compile = False
+                        self._compiled_model = None
+                        raw_outputs = self.forward(preprocessed_inputs)
+                    else:
+                        raise
 
             # Postprocess
             predictions = self.postprocess(raw_outputs)
 
+            # Convert to dict for backward compatibility if needed
+            if hasattr(predictions, 'to_dict'):
+                return predictions.to_dict()
+            
             return predictions
 
         except Exception as e:
@@ -213,13 +227,30 @@ def warmup(self, num_iterations: int = None) -> None:
             dummy_input = self._create_dummy_input()
 
             for i in range(num_iterations):
-                with torch.no_grad():
-                    _ = self.forward(dummy_input)
+                try:
+                    with torch.no_grad():
+                        _ = self.forward(dummy_input)
+                except Exception as e:
+                    self.logger.warning(f"Warmup iteration {i+1} failed: {e}")
+                    # If first iteration fails due to compilation, disable compilation and retry
+                    if i == 0 and "CppCompileError" in str(e):
+                        self.logger.warning("Disabling torch.compile due to compilation error")
+                        self.config.device.use_torch_compile = False
+                        self._compiled_model = None
+                        try:
+                            with torch.no_grad():
+                                _ = self.forward(dummy_input)
+                        except Exception as e2:
+                            self.logger.error(f"Warmup failed even without compilation: {e2}")
+                            break
+                    else:
+                        # For other errors, just continue
+                        continue
 
             self.logger.info("Model warmup completed")
 
         except Exception as e:
-            self.logger.error(f"Warmup failed: {e}")
+            self.logger.warning(f"Warmup failed: {e}. Model may still work for inference.")
 
     def compile_model(self) -> None:
         """Compile the model using torch.compile for optimization."""
@@ -239,7 +270,8 @@ def compile_model(self) -> None:
             )
             self.logger.info("Model compilation completed")
         except Exception as e:
-            self.logger.error(f"Model compilation failed: {e}")
+            self.logger.warning(f"Model compilation failed: {e}. Continuing without compilation.")
+            # Don't raise the exception, just continue without compilation
 
     def get_model_for_inference(self) -> nn.Module:
         """Get the model instance to use for inference (compiled or original)."""
@@ -317,17 +349,33 @@ def model_info(self) -> Dict[str, Any]:
         }
 
         if self.metadata:
-            info["metadata"] = self.metadata
+            # Convert metadata to dict for compatibility
+            if hasattr(self.metadata, '__dict__'):
+                info["metadata"] = self.metadata.__dict__.copy()
+            else:
+                info["metadata"] = {
+                    "model_type": getattr(self.metadata, 'model_type', 'pytorch'),
+                    "input_shape": getattr(self.metadata, 'input_shape', None),
+                    "output_shape": getattr(self.metadata, 'output_shape', None),
+                    "num_parameters": getattr(self.metadata, 'num_parameters', None),
+                    "framework_version": getattr(self.metadata, 'framework_version', None)
+                }
 
         if self._is_loaded:
             info["memory_usage"] = self.get_memory_usage()
 
             # Model parameters count
             if self.model:
-                total_params = sum(p.numel() for p in self.model.parameters())
-                trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
-                info["total_parameters"] = total_params
-                info["trainable_parameters"] = trainable_params
+                try:
+                    # Handle both real models and Mock objects
+                    if hasattr(self.model, 'parameters') and callable(self.model.parameters):
+                        total_params = sum(p.numel() for p in self.model.parameters())
+                        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
+                        info["total_parameters"] = total_params
+                        info["trainable_parameters"] = trainable_params
+                except (TypeError, AttributeError):
+                    # Skip parameter counting for Mock objects or other types
+                    pass
 
         return info