diff --git a/README.md b/README.md
index 8ed70ce..46ef444 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 [![Ultralytics Actions](https://github.com/ultralytics/yolo-ios-app/actions/workflows/format.yml/badge.svg)](https://github.com/ultralytics/yolo-ios-app/actions/workflows/format.yml)
 Discord Ultralytics Forums Ultralytics Reddit
-Welcome to the [Ultralytics YOLO iOS App](https://apps.apple.com/us/app/idetection/id1452689527) GitHub repository! 📖 Leveraging Ultralytics' advanced [YOLOv8 object detection models](https://github.com/ultralytics/ultralytics), this app transforms your iOS device into an intelligent detection tool. Explore our guide to get started with the Ultralytics YOLO iOS App and discover the world in a new and exciting way.
+Welcome to the [Ultralytics YOLO iOS App](https://apps.apple.com/us/app/idetection/id1452689527) GitHub repository! 📖 Leveraging Ultralytics' advanced [YOLO11 object detection models](https://github.com/ultralytics/ultralytics), this app transforms your iOS device into an intelligent detection tool. Explore our guide to get started with the Ultralytics YOLO iOS App and discover the world in a new and exciting way.
 Ultralytics YOLO iOS App previews
@@ -60,17 +60,17 @@ Ensure you have the following before you start:
     In Xcode, go to the project's target settings and choose your Apple Developer account under the "Signing & Capabilities" tab.

-3. **Add YOLOv8 Models to the Project:**
+3. **Add YOLO11 Models to the Project:**

-   Export CoreML INT8 models using the `ultralytics` Python package (with `pip install ultralytics`), or download them from our [GitHub release assets](https://github.com/ultralytics/yolo-ios-app/releases). You should have 5 YOLOv8 models in total. Place these in the `YOLO/Models` directory as seen in the Xcode screenshot below.
+   Export CoreML INT8 models using the `ultralytics` Python package (with `pip install ultralytics`), or download them from our [GitHub release assets](https://github.com/ultralytics/yolo-ios-app/releases). You should have 5 YOLO11 models in total. Place these in the `YOLO/Models` directory as seen in the Xcode screenshot below.

    ```python
    from ultralytics import YOLO

-   # Loop through all YOLOv8 model sizes
+   # Loop through all YOLO11 model sizes
    for size in ("n", "s", "m", "l", "x"):
-       # Load a YOLOv8 PyTorch model
-       model = YOLO(f"yolov8{size}.pt")
+       # Load a YOLO11 PyTorch model
+       model = YOLO(f"yolo11{size}.pt")

        # Export the PyTorch model to CoreML INT8 format with NMS layers
        model.export(format="coreml", int8=True, nms=True, imgsz=[640, 384])
    ```
@@ -89,7 +89,7 @@ Ensure you have the following before you start:
 The Ultralytics YOLO iOS App is designed to be intuitive:

 - **Real-Time Detection:** Launch the app and aim your camera at objects to detect them instantly.
-- **Multiple AI Models:** Select from a range of Ultralytics YOLOv8 models, from YOLOv8n 'nano' to YOLOv8x 'x-large'.
+- **Multiple AI Models:** Select from a range of Ultralytics YOLO11 models, from YOLO11n 'nano' to YOLO11x 'x-large'.
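As a companion to the export snippet in the README hunk above, here is a minimal sketch of how an app could consume one of the exported packages through Vision, mirroring the flow this PR implements in `CoreML.swift`. It assumes `yolo11n.mlpackage` has been added to the Xcode target so the `yolo11n` class is auto-generated; the helper name `runYolo11Detection` is illustrative and not part of this PR.

```swift
import CoreML
import Vision

// Hypothetical standalone helper: load the Xcode-generated model class,
// wrap it for Vision, and run a single detection request on a pixel buffer.
func runYolo11Detection(on pixelBuffer: CVPixelBuffer) throws -> [VNRecognizedObjectObservation] {
  // `yolo11n` is generated by Xcode from yolo11n.mlpackage once it is in the target.
  let coreMLModel = try yolo11n(configuration: MLModelConfiguration()).model
  let visionModel = try VNCoreMLModel(for: coreMLModel)

  var results: [VNRecognizedObjectObservation] = []
  let request = VNCoreMLRequest(model: visionModel) { request, _ in
    // Because the model was exported with nms=True, detections arrive
    // directly as recognized-object observations.
    results = (request.results as? [VNRecognizedObjectObservation]) ?? []
  }
  request.imageCropAndScaleOption = .scaleFill  // same option the app uses

  // perform(_:) is synchronous, so `results` is populated before returning.
  try VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:]).perform([request])
  return results
}
```

In the app itself this logic lives in the `ViewController` extension added by this PR, which additionally smooths inference timing and switches post-processing by task (detect, classify, segment, pose).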
## 💡 Contribute diff --git a/YOLO.xcodeproj/project.pbxproj b/YOLO.xcodeproj/project.pbxproj index 68916e9..b924a39 100644 --- a/YOLO.xcodeproj/project.pbxproj +++ b/YOLO.xcodeproj/project.pbxproj @@ -13,16 +13,25 @@ 636EFCAF21E62DD300DE43BC /* VideoCapture.swift in Sources */ = {isa = PBXBuildFile; fileRef = 636EFCA221E62DD300DE43BC /* VideoCapture.swift */; }; 636EFCB321E62DD300DE43BC /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 636EFCA721E62DD300DE43BC /* AppDelegate.swift */; }; 636EFCB921E62E3900DE43BC /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 636EFCB821E62E3900DE43BC /* Assets.xcassets */; }; - 6381D2182B7817C200ABA4E8 /* yolov8l.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 6381D2132B7817C200ABA4E8 /* yolov8l.mlpackage */; }; - 6381D2192B7817C200ABA4E8 /* yolov8x.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 6381D2142B7817C200ABA4E8 /* yolov8x.mlpackage */; }; - 6381D21A2B7817C200ABA4E8 /* yolov8s.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 6381D2152B7817C200ABA4E8 /* yolov8s.mlpackage */; }; - 6381D21B2B7817C200ABA4E8 /* yolov8m.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 6381D2162B7817C200ABA4E8 /* yolov8m.mlpackage */; }; - 6381D21C2B7817C200ABA4E8 /* yolov8n.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 6381D2172B7817C200ABA4E8 /* yolov8n.mlpackage */; }; 63CF371F2514455300E2DEA1 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 6323C44D22186177008AE681 /* LaunchScreen.storyboard */; }; 63CF37202514455300E2DEA1 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 6323C44F22186177008AE681 /* Main.storyboard */; }; 63CF37212514455300E2DEA1 /* ultralytics_yolo_logotype.png in Resources */ = {isa = PBXBuildFile; fileRef = 6323C45122186177008AE681 /* ultralytics_yolo_logotype.png */; }; + 733FEE4F2CF2D77600C0D4E9 /* yolo11s.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE4D2CF2D77600C0D4E9 /* yolo11s.mlpackage */; }; + 733FEE502CF2D77600C0D4E9 /* yolo11m.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE4B2CF2D77600C0D4E9 /* yolo11m.mlpackage */; }; + 733FEE512CF2D77600C0D4E9 /* yolo11n.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE4C2CF2D77600C0D4E9 /* yolo11n.mlpackage */; }; + 733FEE522CF2D77600C0D4E9 /* yolo11x.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE4E2CF2D77600C0D4E9 /* yolo11x.mlpackage */; }; + 733FEE532CF2D77600C0D4E9 /* yolo11l.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE4A2CF2D77600C0D4E9 /* yolo11l.mlpackage */; }; + 733FEE552CF2DB6500C0D4E9 /* Classify.swift in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE542CF2DB6500C0D4E9 /* Classify.swift */; }; + 733FEE572CF357A900C0D4E9 /* yolo11n-cls.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE562CF357A900C0D4E9 /* yolo11n-cls.mlpackage */; }; + 733FEE592CF3589A00C0D4E9 /* Detect.swift in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE582CF3589A00C0D4E9 /* Detect.swift */; }; + 733FEE5B2CF4BFA400C0D4E9 /* Segment.swift in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE5A2CF4BFA400C0D4E9 /* Segment.swift */; }; + 733FEE5D2CF5108C00C0D4E9 /* CoreML.swift in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE5C2CF5108C00C0D4E9 /* CoreML.swift */; }; + 733FEE5F2CF579EE00C0D4E9 /* yolo11n-seg.mlpackage in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE5E2CF579EE00C0D4E9 /* yolo11n-seg.mlpackage */; }; + 733FEE632CF57A2200C0D4E9 /* yolo11n-pose.mlpackage in Sources */ = {isa 
= PBXBuildFile; fileRef = 733FEE612CF57A2200C0D4E9 /* yolo11n-pose.mlpackage */; }; + 733FEE652CF6D65A00C0D4E9 /* Pose.swift in Sources */ = {isa = PBXBuildFile; fileRef = 733FEE642CF6D65A00C0D4E9 /* Pose.swift */; }; 8EDAA33950796844333D60A7 /* BoundingBoxView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8EDAA633C1F2B50286D16008 /* BoundingBoxView.swift */; }; /* End PBXBuildFile section */ + /* Begin PBXFileReference section */ 6323C44D22186177008AE681 /* LaunchScreen.storyboard */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; path = LaunchScreen.storyboard; sourceTree = ""; }; 6323C44F22186177008AE681 /* Main.storyboard */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; path = Main.storyboard; sourceTree = ""; }; @@ -34,12 +43,20 @@ 636EFCA221E62DD300DE43BC /* VideoCapture.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VideoCapture.swift; sourceTree = ""; }; 636EFCA721E62DD300DE43BC /* AppDelegate.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = ""; }; 636EFCB821E62E3900DE43BC /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; - 6381D2132B7817C200ABA4E8 /* yolov8l.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = yolov8l.mlpackage; sourceTree = ""; }; - 6381D2142B7817C200ABA4E8 /* yolov8x.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = yolov8x.mlpackage; sourceTree = ""; }; - 6381D2152B7817C200ABA4E8 /* yolov8s.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = yolov8s.mlpackage; sourceTree = ""; }; - 6381D2162B7817C200ABA4E8 /* yolov8m.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = yolov8m.mlpackage; sourceTree = ""; }; - 6381D2172B7817C200ABA4E8 /* yolov8n.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = yolov8n.mlpackage; sourceTree = ""; }; 63B8B0A821E62A890026FBC3 /* .gitignore */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = .gitignore; sourceTree = ""; }; + 733FEE4A2CF2D77600C0D4E9 /* yolo11l.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = yolo11l.mlpackage; sourceTree = ""; }; + 733FEE4B2CF2D77600C0D4E9 /* yolo11m.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = yolo11m.mlpackage; sourceTree = ""; }; + 733FEE4C2CF2D77600C0D4E9 /* yolo11n.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = yolo11n.mlpackage; sourceTree = ""; }; + 733FEE4D2CF2D77600C0D4E9 /* yolo11s.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = yolo11s.mlpackage; sourceTree = ""; }; + 733FEE4E2CF2D77600C0D4E9 /* yolo11x.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = yolo11x.mlpackage; sourceTree = ""; }; + 733FEE542CF2DB6500C0D4E9 /* Classify.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Classify.swift; sourceTree = ""; }; + 733FEE562CF357A900C0D4E9 /* yolo11n-cls.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = "yolo11n-cls.mlpackage"; sourceTree = ""; }; + 733FEE582CF3589A00C0D4E9 /* Detect.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Detect.swift; sourceTree = ""; 
}; + 733FEE5A2CF4BFA400C0D4E9 /* Segment.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Segment.swift; sourceTree = ""; }; + 733FEE5C2CF5108C00C0D4E9 /* CoreML.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CoreML.swift; sourceTree = ""; }; + 733FEE5E2CF579EE00C0D4E9 /* yolo11n-seg.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = "yolo11n-seg.mlpackage"; sourceTree = ""; }; + 733FEE612CF57A2200C0D4E9 /* yolo11n-pose.mlpackage */ = {isa = PBXFileReference; lastKnownFileType = folder.mlpackage; path = "yolo11n-pose.mlpackage"; sourceTree = ""; }; + 733FEE642CF6D65A00C0D4E9 /* Pose.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Pose.swift; sourceTree = ""; }; 7BCB411721C3096100BFC4D0 /* YOLO.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = YOLO.app; sourceTree = BUILT_PRODUCTS_DIR; }; 8EDAA633C1F2B50286D16008 /* BoundingBoxView.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BoundingBoxView.swift; sourceTree = ""; }; 8EDAAA4507D2D23D7FAB827F /* README.md */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = net.daringfireball.markdown; path = README.md; sourceTree = ""; }; @@ -61,6 +78,11 @@ children = ( 636166E9251443B20054FA7E /* ThresholdProvider.swift */, 8EDAA633C1F2B50286D16008 /* BoundingBoxView.swift */, + 733FEE5C2CF5108C00C0D4E9 /* CoreML.swift */, + 733FEE582CF3589A00C0D4E9 /* Detect.swift */, + 733FEE542CF2DB6500C0D4E9 /* Classify.swift */, + 733FEE5A2CF4BFA400C0D4E9 /* Segment.swift */, + 733FEE642CF6D65A00C0D4E9 /* Pose.swift */, ); path = Utilities; sourceTree = ""; @@ -86,11 +108,14 @@ 63A946D8271800E20001C3ED /* Models */ = { isa = PBXGroup; children = ( - 6381D2132B7817C200ABA4E8 /* yolov8l.mlpackage */, - 6381D2162B7817C200ABA4E8 /* yolov8m.mlpackage */, - 6381D2172B7817C200ABA4E8 /* yolov8n.mlpackage */, - 6381D2152B7817C200ABA4E8 /* yolov8s.mlpackage */, - 6381D2142B7817C200ABA4E8 /* yolov8x.mlpackage */, + 733FEE4A2CF2D77600C0D4E9 /* yolo11l.mlpackage */, + 733FEE4B2CF2D77600C0D4E9 /* yolo11m.mlpackage */, + 733FEE4C2CF2D77600C0D4E9 /* yolo11n.mlpackage */, + 733FEE4D2CF2D77600C0D4E9 /* yolo11s.mlpackage */, + 733FEE4E2CF2D77600C0D4E9 /* yolo11x.mlpackage */, + 733FEE562CF357A900C0D4E9 /* yolo11n-cls.mlpackage */, + 733FEE5E2CF579EE00C0D4E9 /* yolo11n-seg.mlpackage */, + 733FEE612CF57A2200C0D4E9 /* yolo11n-pose.mlpackage */, ); path = Models; sourceTree = ""; @@ -209,15 +234,23 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( - 6381D21B2B7817C200ABA4E8 /* yolov8m.mlpackage in Sources */, - 6381D21C2B7817C200ABA4E8 /* yolov8n.mlpackage in Sources */, 636EFCAF21E62DD300DE43BC /* VideoCapture.swift in Sources */, + 733FEE4F2CF2D77600C0D4E9 /* yolo11s.mlpackage in Sources */, + 733FEE502CF2D77600C0D4E9 /* yolo11m.mlpackage in Sources */, + 733FEE5B2CF4BFA400C0D4E9 /* Segment.swift in Sources */, + 733FEE512CF2D77600C0D4E9 /* yolo11n.mlpackage in Sources */, + 733FEE522CF2D77600C0D4E9 /* yolo11x.mlpackage in Sources */, + 733FEE552CF2DB6500C0D4E9 /* Classify.swift in Sources */, + 733FEE5F2CF579EE00C0D4E9 /* yolo11n-seg.mlpackage in Sources */, + 733FEE632CF57A2200C0D4E9 /* yolo11n-pose.mlpackage in Sources */, + 733FEE572CF357A900C0D4E9 /* yolo11n-cls.mlpackage in Sources */, + 733FEE532CF2D77600C0D4E9 /* yolo11l.mlpackage in Sources */, + 733FEE592CF3589A00C0D4E9 /* Detect.swift in Sources */, 
636166EA251443B20054FA7E /* ThresholdProvider.swift in Sources */, - 6381D2182B7817C200ABA4E8 /* yolov8l.mlpackage in Sources */, - 6381D21A2B7817C200ABA4E8 /* yolov8s.mlpackage in Sources */, - 6381D2192B7817C200ABA4E8 /* yolov8x.mlpackage in Sources */, 636EFCB321E62DD300DE43BC /* AppDelegate.swift in Sources */, 636EFCAA21E62DD300DE43BC /* ViewController.swift in Sources */, + 733FEE652CF6D65A00C0D4E9 /* Pose.swift in Sources */, + 733FEE5D2CF5108C00C0D4E9 /* CoreML.swift in Sources */, 8EDAA33950796844333D60A7 /* BoundingBoxView.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; @@ -354,12 +387,12 @@ INFOPLIST_FILE = YOLO/Info.plist; INFOPLIST_KEY_CFBundleDisplayName = "Ultralytics YOLO"; INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.developer-tools"; - IPHONEOS_DEPLOYMENT_TARGET = 14.0; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", ); - MARKETING_VERSION = 8.2.0; + MARKETING_VERSION = 8.3.0; PRODUCT_BUNDLE_IDENTIFIER = com.ultralytics.iDetection; PRODUCT_NAME = "$(TARGET_NAME)"; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; @@ -382,12 +415,12 @@ INFOPLIST_FILE = YOLO/Info.plist; INFOPLIST_KEY_CFBundleDisplayName = "Ultralytics YOLO"; INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.developer-tools"; - IPHONEOS_DEPLOYMENT_TARGET = 14.0; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", ); - MARKETING_VERSION = 8.2.0; + MARKETING_VERSION = 8.3.0; PRODUCT_BUNDLE_IDENTIFIER = com.ultralytics.iDetection; PRODUCT_NAME = "$(TARGET_NAME)"; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; diff --git a/YOLO/Info.plist b/YOLO/Info.plist index a7022ec..308e4be 100644 --- a/YOLO/Info.plist +++ b/YOLO/Info.plist @@ -21,7 +21,7 @@ CFBundleShortVersionString $(MARKETING_VERSION) CFBundleVersion - 25 + 117 ITSAppUsesNonExemptEncryption LSRequiresIPhoneOS diff --git a/YOLO/Main.storyboard b/YOLO/Main.storyboard index 549bc72..cf599d6 100644 --- a/YOLO/Main.storyboard +++ b/YOLO/Main.storyboard @@ -1,9 +1,9 @@ - + - + @@ -32,7 +32,7 @@ - diff --git a/YOLO/Utilities/Classify.swift b/YOLO/Utilities/Classify.swift new file mode 100644 index 0000000..81138f0 --- /dev/null +++ b/YOLO/Utilities/Classify.swift @@ -0,0 +1,93 @@ +// +// Classify.swift +// YOLO +// + +import Foundation +import UIKit +import Vision + +extension ViewController { + // view + func setupClassifyOverlay() { + + classifyOverlay = UILabel( + frame: CGRect(x: view.center.x - 100, y: view.center.y - 50, width: 200, height: 100)) + + classifyOverlay.backgroundColor = UIColor.black.withAlphaComponent(0.5) + classifyOverlay.clipsToBounds = true + classifyOverlay.layer.cornerRadius = 8 + classifyOverlay.numberOfLines = 2 + classifyOverlay.textAlignment = .left + view.addSubview(classifyOverlay) + classifyOverlay.isHidden = true + } + + func showClassifyUI() { + taskSegmentControl.selectedSegmentIndex = 1 + modelSegmentedControl.selectedSegmentIndex = 0 + classifyOverlay.isHidden = false + } + + func updateClassifyOverlay() { + + classifyOverlay.frame = CGRect( + x: view.center.x - 100, y: view.center.y - 50, width: 200, height: 100) + } + // post process + + func postProcessClassify(request: VNRequest) { + if let observation = visionRequest.results as? 
[VNCoreMLFeatureValueObservation] { + + // Get the MLMultiArray from the observation + let multiArray = observation.first?.featureValue.multiArrayValue + + if let multiArray = multiArray { + // Initialize an array to store the classes + var valuesArray = [Double]() + + // Loop through the MLMultiArray and append its values to the array + for i in 0.. $1.value }) + + var recognitions: [[String: Any]] = [] + for (index, value) in sortedMap { + let label = self.classifyLabels[index] + recognitions.append([ + "label": label, + "confidence": value, + "index": index, + ]) + } + print(recognitions) + } + } else if let observations = request.results as? [VNClassificationObservation] { + + var recognitions: [[String: Any]] = [] + + // Convert each VNClassificationObservation into the desired format + guard let topResult = observations.first else { return } + let label = topResult.identifier // Class label + let confidence = topResult.confidence // Confidence score (between 0 and 1) + let percentageValue = confidence * 100 + let formattedPercentage = round(percentageValue * 10) / 10 + + let resultText = " \(label)\n \(formattedPercentage) %" + DispatchQueue.main.async { + self.classifyOverlay.text = resultText + } + + } + } +} diff --git a/YOLO/Utilities/CoreML.swift b/YOLO/Utilities/CoreML.swift new file mode 100644 index 0000000..5cf6c1e --- /dev/null +++ b/YOLO/Utilities/CoreML.swift @@ -0,0 +1,151 @@ +// +// ModelSelect.swift +// YOLO +// +// Created by 間嶋大輔 on 2024/11/26. +// Copyright © 2024 Ultralytics. All rights reserved. +// + +import Foundation +import UIKit +import Vision + +extension ViewController { + func predict(sampleBuffer: CMSampleBuffer) { + if currentBuffer == nil, let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) { + currentBuffer = pixelBuffer + if !frameSizeCaptured { + let frameWidth = CGFloat(CVPixelBufferGetWidth(pixelBuffer)) + let frameHeight = CGFloat(CVPixelBufferGetHeight(pixelBuffer)) + longSide = max(frameWidth, frameHeight) + shortSide = min(frameWidth, frameHeight) + frameSizeCaptured = true + } + /// - Tag: MappingOrientation + // The frame is always oriented based on the camera sensor, + // so in most cases Vision needs to rotate it for the model to work as expected. + let imageOrientation: CGImagePropertyOrientation + switch UIDevice.current.orientation { + case .portrait: + imageOrientation = .up + case .portraitUpsideDown: + imageOrientation = .down + case .landscapeLeft: + imageOrientation = .up + case .landscapeRight: + imageOrientation = .up + case .unknown: + imageOrientation = .up + default: + imageOrientation = .up + } + + // Invoke a VNRequestHandler with that image + let handler = VNImageRequestHandler( + cvPixelBuffer: pixelBuffer, orientation: imageOrientation, options: [:]) + if UIDevice.current.orientation != .faceUp { // stop if placed down on a table + t0 = CACurrentMediaTime() // inference start + do { + try handler.perform([visionRequest]) + } catch { + print(error) + } + t1 = CACurrentMediaTime() - t0 // inference dt + } + + currentBuffer = nil + } + } + + func processObservations(for request: VNRequest, error: Error?) 
{ + DispatchQueue.main.async { + switch self.task { + case .detect: + self.postProcessDetect(request: request) + case .classify: + self.postProcessClassify(request: request) + case .segment: + self.postProcessSegment(request: request) + case .pose: + self.postProcessPose(request: request) + default: + break + } + // Measure FPS + if self.t1 < 10.0 { // valid dt + self.t2 = self.t1 * 0.05 + self.t2 * 0.95 // smoothed inference time + } + self.t4 = (CACurrentMediaTime() - self.t3) * 0.05 + self.t4 * 0.95 // smoothed delivered FPS + self.labelFPS.text = String(format: "%.1f FPS - %.1f ms", 1 / self.t4, self.t2 * 1000) // t2 seconds to ms + self.t3 = CACurrentMediaTime() + } + } + + func setModel() { + + /// Switch model + switch task { + case .detect: + switch modelSegmentedControl.selectedSegmentIndex { + case 0: + self.labelName.text = "YOLOv11n" + mlModel = try! yolo11n(configuration: .init()).model + case 1: + self.labelName.text = "YOLOv11s" + mlModel = try! yolo11s(configuration: .init()).model + case 2: + self.labelName.text = "YOLOv11m" + mlModel = try! yolo11m(configuration: .init()).model + case 3: + self.labelName.text = "YOLOv11l" + mlModel = try! yolo11l(configuration: .init()).model + case 4: + self.labelName.text = "YOLOv11x" + mlModel = try! yolo11x(configuration: .init()).model + default: + break + } + case .classify: + switch modelSegmentedControl.selectedSegmentIndex { + case 0: + self.labelName.text = "YOLO11n" + mlModel = try! yolo11n_cls(configuration: .init()).model + default: break + } + case .segment: + switch modelSegmentedControl.selectedSegmentIndex { + case 0: + self.labelName.text = "YOLO11n" + mlModel = try! yolo11n_seg(configuration: .init()).model + default: break + } + case .pose: + switch modelSegmentedControl.selectedSegmentIndex { + case 0: + self.labelName.text = "YOLO11n" + mlModel = try! yolo11n_pose(configuration: .init()).model + default: break + } + default: + break + } + DispatchQueue.global(qos: .userInitiated).async { [self] in + + /// VNCoreMLModel + detector = try! VNCoreMLModel(for: mlModel) + detector.featureProvider = ThresholdProvider() + + /// VNCoreMLRequest + let request = VNCoreMLRequest( + model: detector, + completionHandler: { [weak self] request, error in + self?.processObservations(for: request, error: error) + }) + request.imageCropAndScaleOption = .scaleFill // .scaleFit, .scaleFill, .centerCrop + visionRequest = request + t2 = 0.0 // inference dt smoothed + t3 = CACurrentMediaTime() // FPS start + t4 = 0.0 // FPS dt smoothed + } + } +} diff --git a/YOLO/Utilities/Detect.swift b/YOLO/Utilities/Detect.swift new file mode 100644 index 0000000..658bfa3 --- /dev/null +++ b/YOLO/Utilities/Detect.swift @@ -0,0 +1,216 @@ +// +// Detect.swift +// YOLO + +import Foundation +import UIKit +import Vision + +extension ViewController { + func postProcessDetect(request: VNRequest) { + if let results = request.results as? 
[VNRecognizedObjectObservation] { + var predictions = [DetectionResult]() + for result in results { + let prediction = DetectionResult( + rect: result.boundingBox, label: result.labels[0].identifier, + confidence: result.labels[0].confidence) + predictions.append(prediction) + } + self.showBoundingBoxes(predictions: predictions) + } else { + self.showBoundingBoxes(predictions: []) + } + } + + func showBoundingBoxes(predictions: [DetectionResult]) { + var str = "" + // date + let date = Date() + let calendar = Calendar.current + let hour = calendar.component(.hour, from: date) + let minutes = calendar.component(.minute, from: date) + let seconds = calendar.component(.second, from: date) + let nanoseconds = calendar.component(.nanosecond, from: date) + let sec_day = + Double(hour) * 3600.0 + Double(minutes) * 60.0 + Double(seconds) + Double(nanoseconds) / 1E9 // seconds in the day + + self.labelSlider.text = + String(predictions.count) + " items (max " + String(Int(slider.value)) + ")" + let width = videoPreview.bounds.width // 375 pix + let height = videoPreview.bounds.height // 812 pix + + if UIDevice.current.orientation == .portrait { + + // ratio = videoPreview AR divided by sessionPreset AR + var ratio: CGFloat = 1.0 + if videoCapture.captureSession.sessionPreset == .photo { + ratio = (height / width) / (4.0 / 3.0) // .photo + } else { + ratio = (height / width) / (16.0 / 9.0) // .hd4K3840x2160, .hd1920x1080, .hd1280x720 etc. + } + + for i in 0..= 1 { // iPhone ratio = 1.218 + let offset = (1 - ratio) * (0.5 - rect.minX) + if task == .detect { + let transform = CGAffineTransform(scaleX: 1, y: -1).translatedBy(x: offset, y: -1) + rect = rect.applying(transform) + } else { + let transform = CGAffineTransform(translationX: offset, y: 0) + rect = rect.applying(transform) + } + rect.size.width *= ratio + } else { // iPad ratio = 0.75 + let offset = (ratio - 1) * (0.5 - rect.maxY) + let transform = CGAffineTransform(scaleX: 1, y: -1).translatedBy(x: 0, y: offset - 1) + rect = rect.applying(transform) + ratio = (height / width) / (3.0 / 4.0) + rect.size.height /= ratio + } + + // Scale normalized to pixels [375, 812] [width, height] + rect = VNImageRectForNormalizedRect(rect, Int(width), Int(height)) + + // The labels array is a list of VNClassificationObservation objects, + // with the highest scoring class first in the list. + let bestClass = prediction.label + let confidence = prediction.confidence + // print(confidence, rect) // debug (confidence, xywh) with xywh origin top left (pixels) + let label = String(format: "%@ %.1f", bestClass, confidence * 100) + let alpha = CGFloat((confidence - 0.2) / (1.0 - 0.2) * 0.9) + // Show the bounding box. + boundingBoxViews[i].show( + frame: rect, + label: label, + color: colors[bestClass] ?? 
UIColor.white, + alpha: alpha) // alpha 0 (transparent) to 1 (opaque) for conf threshold 0.2 to 1.0) + + if developerMode { + // Write + if save_detections { + str += String( + format: "%.3f %.3f %.3f %@ %.2f %.1f %.1f %.1f %.1f\n", + sec_day, freeSpace(), UIDevice.current.batteryLevel, bestClass, confidence, + rect.origin.x, rect.origin.y, rect.size.width, rect.size.height) + } + } + } else { + boundingBoxViews[i].hide() + } + } + } else { + let frameAspectRatio = longSide / shortSide + let viewAspectRatio = width / height + var scaleX: CGFloat = 1.0 + var scaleY: CGFloat = 1.0 + var offsetX: CGFloat = 0.0 + var offsetY: CGFloat = 0.0 + + if frameAspectRatio > viewAspectRatio { + scaleY = height / shortSide + scaleX = scaleY + offsetX = (longSide * scaleX - width) / 2 + } else { + scaleX = width / longSide + scaleY = scaleX + offsetY = (shortSide * scaleY - height) / 2 + } + + for i in 0.. [(CGRect, Float, [Float])] + { + let numAnchors = prediction.shape[2].intValue + let featureCount = prediction.shape[1].intValue - 5 + var boxes = [CGRect]() + var scores = [Float]() + var features = [[Float]]() + let featurePointer = UnsafeMutablePointer(OpaquePointer(prediction.dataPointer)) + let lock = DispatchQueue(label: "com.example.lock") + + DispatchQueue.concurrentPerform(iterations: numAnchors) { j in + let confIndex = 4 * numAnchors + j + let confidence = featurePointer[confIndex] + + if confidence > confidenceThreshold { + let x = featurePointer[j] + let y = featurePointer[numAnchors + j] + let width = featurePointer[2 * numAnchors + j] + let height = featurePointer[3 * numAnchors + j] + + let boxWidth = CGFloat(width) + let boxHeight = CGFloat(height) + let boxX = CGFloat(x - width / 2) + let boxY = CGFloat(y - height / 2) + + let boundingBox = CGRect(x: boxX, y: boxY, width: boxWidth, height: boxHeight) + + var boxFeatures = [Float](repeating: 0, count: featureCount) + for k in 0..= confThreshold + && box.contains(CGPoint(x: CGFloat(keypoints[i * 3]), y: CGFloat(keypoints[i * 3 + 1]))) + { + points[i] = (point, conf) + + drawCircle(on: layer, at: point, radius: radius, color: kptColorIndices[i]) + } + } + + if drawSkeleton { + for (index, bone) in skeleton.enumerated() { + let (startIdx, endIdx) = (bone[0] - 1, bone[1] - 1) + + guard startIdx < points.count, endIdx < points.count else { + print("Invalid skeleton indices: \(startIdx), \(endIdx)") + continue + } + + let startPoint = points[startIdx].0 + let endPoint = points[endIdx].0 + let startConf = points[startIdx].1 + let endConf = points[endIdx].1 + + if startConf >= confThreshold && endConf >= confThreshold { + drawLine(on: layer, from: startPoint, to: endPoint, color: limbColorIndices[index]) + } + } + } + } + + func drawCircle(on layer: CALayer, at point: CGPoint, radius: CGFloat, color index: Int) { + let circleLayer = CAShapeLayer() + circleLayer.path = + UIBezierPath( + arcCenter: point, + radius: radius, + startAngle: 0, + endAngle: .pi * 2, + clockwise: true + ).cgPath + + let color = posePalette[index].map { $0 / 255.0 } + circleLayer.fillColor = + UIColor(red: color[0], green: color[1], blue: color[2], alpha: 1.0).cgColor + + layer.addSublayer(circleLayer) + } + + func drawLine(on layer: CALayer, from start: CGPoint, to end: CGPoint, color index: Int) { + let lineLayer = CAShapeLayer() + let path = UIBezierPath() + path.move(to: start) + path.addLine(to: end) + + lineLayer.path = path.cgPath + lineLayer.lineWidth = 2 + + let color = posePalette[index].map { $0 / 255.0 } + lineLayer.strokeColor = + UIColor(red: color[0], 
green: color[1], blue: color[2], alpha: 1.0).cgColor + + layer.addSublayer(lineLayer) + } +} + +let posePalette: [[CGFloat]] = [ + [255, 128, 0], + [255, 153, 51], + [255, 178, 102], + [230, 230, 0], + [255, 153, 255], + [153, 204, 255], + [255, 102, 255], + [255, 51, 255], + [102, 178, 255], + [51, 153, 255], + [255, 153, 153], + [255, 102, 102], + [255, 51, 51], + [153, 255, 153], + [102, 255, 102], + [51, 255, 51], + [0, 255, 0], + [0, 0, 255], + [255, 0, 0], + [255, 255, 255], +] + +let limbColorIndices = [0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16] +let kptColorIndices = [16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0] + +let skeleton = [ + [16, 14], + [14, 12], + [17, 15], + [15, 13], + [12, 13], + [6, 12], + [7, 13], + [6, 7], + [6, 8], + [7, 9], + [8, 10], + [9, 11], + [2, 3], + [1, 2], + [1, 3], + [2, 4], + [3, 5], + [4, 6], + [5, 7], +] diff --git a/YOLO/Utilities/Segment.swift b/YOLO/Utilities/Segment.swift new file mode 100644 index 0000000..fc29cfa --- /dev/null +++ b/YOLO/Utilities/Segment.swift @@ -0,0 +1,379 @@ +// +// Segment.swift +// YOLO +// + +import Accelerate +import Foundation +import UIKit +import Vision + +extension ViewController { + func setupSegmentPoseOverlay() { + let width = videoPreview.bounds.width + let height = videoPreview.bounds.height + + var ratio: CGFloat = 1.0 + if videoCapture.captureSession.sessionPreset == .photo { + ratio = (4.0 / 3.0) + } else { + ratio = (16.0 / 9.0) + } + var offSet = CGFloat.zero + var margin = CGFloat.zero + if view.bounds.width < view.bounds.height { + offSet = height / ratio + margin = (offSet - self.videoPreview.bounds.width) / 2 + self.segmentPoseOverlay.frame = CGRect( + x: -margin, y: 0, width: offSet, height: self.videoPreview.bounds.height) + } else { + offSet = width / ratio + margin = (offSet - self.videoPreview.bounds.height) / 2 + self.segmentPoseOverlay.frame = CGRect( + x: 0, y: -margin, width: self.videoPreview.bounds.width, height: offSet) + + } + var count = 0 + for _ in colors { + let color = ultralyticsColorsolors[count] + count += 1 + if count > 19 { + count = 0 + } + guard let colorForMask = color.toRGBComponents() else { fatalError() } + colorsForMasks.append(colorForMask) + } + } + + func postProcessSegment(request: VNRequest) { + if let results = request.results as? [VNCoreMLFeatureValueObservation] { + DispatchQueue.main.async { [self] in + guard results.count == 2 else { return } + let masks = results[0].featureValue.multiArrayValue + let pred = results[1].featureValue.multiArrayValue + let a = Date() + + let processed = getBoundingBoxesAndMasks( + feature: pred!, confidenceThreshold: 0.25, iouThreshold: 0.4) + var predictions = [DetectionResult]() + for object in processed { + let box = object.0 + let rect = CGRect( + x: box.minX / 640, y: box.minY / 640, width: box.width / 640, height: box.height / 640) + let bestClass = classes[object.1] + let confidence = object.2 + let prediction = DetectionResult(rect: rect, label: bestClass, confidence: confidence) + predictions.append(prediction) + } + self.showBoundingBoxes(predictions: predictions) + self.updateMaskAndBoxes(detectedObjects: processed, maskArray: masks!) 
+ } + } + } + + func getBoundingBoxesAndMasks( + feature: MLMultiArray, confidenceThreshold: Float, iouThreshold: Float + ) -> [(CGRect, Int, Float, MLMultiArray)] { + let numAnchors = feature.shape[2].intValue + let numFeatures = feature.shape[1].intValue + let boxFeatureLength = 4 + let maskConfidenceLength = 32 + let numClasses = numFeatures - boxFeatureLength - maskConfidenceLength + + var results = [(CGRect, Int, Float, MLMultiArray)]() + let featurePointer = feature.dataPointer.assumingMemoryBound(to: Float.self) + + let resultsQueue = DispatchQueue(label: "resultsQueue", attributes: .concurrent) + + DispatchQueue.concurrentPerform(iterations: numAnchors) { j in + let baseOffset = j + let x = featurePointer[baseOffset] + let y = featurePointer[numAnchors + baseOffset] + let width = featurePointer[2 * numAnchors + baseOffset] + let height = featurePointer[3 * numAnchors + baseOffset] + + let boxWidth = CGFloat(width) + let boxHeight = CGFloat(height) + let boxX = CGFloat(x - width / 2) + let boxY = CGFloat(y - height / 2) + + let boundingBox = CGRect(x: boxX, y: boxY, width: boxWidth, height: boxHeight) + + var classProbs = [Float](repeating: 0, count: numClasses) + classProbs.withUnsafeMutableBufferPointer { classProbsPointer in + vDSP_mtrans( + featurePointer + 4 * numAnchors + baseOffset, numAnchors, classProbsPointer.baseAddress!, + 1, 1, vDSP_Length(numClasses)) + } + var maxClassValue: Float = 0 + var maxClassIndex: vDSP_Length = 0 + vDSP_maxvi(classProbs, 1, &maxClassValue, &maxClassIndex, vDSP_Length(numClasses)) + + if maxClassValue > confidenceThreshold { + let maskProbsPointer = featurePointer + (4 + numClasses) * numAnchors + baseOffset + let maskProbs = try! MLMultiArray( + shape: [NSNumber(value: maskConfidenceLength)], dataType: .float32) + for i in 0.. $1.0.size.width * $1.0.size.height + } + + var newLayers: [CALayer] = [] + + for (box, classIndex, _, masksIn) in sortedObjects { + group.enter() + DispatchQueue.global(qos: .userInitiated).async { + if let maskImage = self.generateColoredMaskImage( + from: masksIn, protos: maskArray, in: self.segmentPoseOverlay.bounds.size, + colorIndex: classIndex, + boundingBox: box) + { + let adjustedBox = self.adjustBox(box, toFitIn: self.segmentPoseOverlay.bounds.size) + + let maskImageLayer = CALayer() + maskImageLayer.frame = adjustedBox + maskImageLayer.contents = maskImage + maskImageLayer.opacity = 0.5 + DispatchQueue.main.async { + newLayers.append(maskImageLayer) + } + } + group.leave() + } + } + + // 全タスクの終了を待つ + group.notify(queue: .main) { [weak self] in + guard let self = self else { return } + self.removeAllMaskSubLayers() + newLayers.forEach { self.segmentPoseOverlay.addSublayer($0) } + print("update complete") + print("Time elapsed: \(Date().timeIntervalSince(startTime))") + self.isUpdating = false // フラグを解除 + } + } + func generateColoredMaskImage( + from masksIn: MLMultiArray, protos: MLMultiArray, in size: CGSize, colorIndex: Int, + boundingBox: CGRect + ) -> CGImage? 
{ + let maskWidth = protos.shape[3].intValue + let maskHeight = protos.shape[2].intValue + let maskChannels = protos.shape[1].intValue + + guard protos.shape.count == 4, protos.shape[0].intValue == 1, masksIn.shape.count == 1, + masksIn.shape[0].intValue == maskChannels + else { + print("Invalid shapes for protos or masksIn") + return nil + } + + let masksPointer = masksIn.dataPointer.assumingMemoryBound(to: Float.self) + let protosPointer = protos.dataPointer.assumingMemoryBound(to: Float.self) + + let masksPointerOutput = UnsafeMutablePointer.allocate(capacity: maskHeight * maskWidth) + vDSP_mmul( + masksPointer, 1, protosPointer, 1, masksPointerOutput, 1, vDSP_Length(1), + vDSP_Length(maskHeight * maskWidth), vDSP_Length(maskChannels)) + + let threshold: Float = 0.5 + let color = colorsForMask[colorIndex] + let red = UInt8(color.red) + let green = UInt8(color.green) + let blue = UInt8(color.blue) + + var maskPixels = [UInt8](repeating: 0, count: maskHeight * maskWidth * 4) + for y in 0.. threshold { + let pixelIndex = index * 4 + maskPixels[pixelIndex] = red + maskPixels[pixelIndex + 1] = green + maskPixels[pixelIndex + 2] = blue + maskPixels[pixelIndex + 3] = 255 + } + } + } + + let maskDataPointer = UnsafeMutablePointer.allocate(capacity: maskPixels.count) + maskDataPointer.initialize(from: maskPixels, count: maskPixels.count) + + let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.premultipliedLast.rawValue) + let colorSpace = CGColorSpaceCreateDeviceRGB() + + let maskDataProvider = CGDataProvider( + dataInfo: nil, data: maskDataPointer, size: maskPixels.count + ) { _, data, _ in + data.deallocate() + } + + guard + let maskCGImage = CGImage( + width: maskWidth, height: maskHeight, bitsPerComponent: 8, bitsPerPixel: 32, + bytesPerRow: maskWidth * 4, space: colorSpace, bitmapInfo: bitmapInfo, + provider: maskDataProvider!, decode: nil, shouldInterpolate: true, intent: .defaultIntent) + else { + masksPointerOutput.deallocate() + return nil + } + + let maskCIImage = CIImage(cgImage: maskCGImage) + let scaledCIImage = maskCIImage.transformed( + by: CGAffineTransform( + scaleX: size.width / CGFloat(maskWidth), y: size.height / CGFloat(maskHeight))) + let invertedY = size.height - (boundingBox.origin.y + boundingBox.height) * size.height / 640.0 + let cropRect = CGRect( + x: boundingBox.origin.x * size.width / 640.0, y: invertedY, + width: boundingBox.width * size.width / 640.0, + height: boundingBox.height * size.height / 640.0) + + let croppedCIImage = scaledCIImage.cropped(to: cropRect) + + let ciContext = CIContext() + guard let cgImage = ciContext.createCGImage(croppedCIImage, from: cropRect) else { + masksPointerOutput.deallocate() + return nil + } + + masksPointerOutput.deallocate() + + return cgImage + } + + func removeAllMaskSubLayers() { + self.segmentPoseOverlay.sublayers?.forEach { layer in + layer.removeFromSuperlayer() + } + self.segmentPoseOverlay.sublayers = nil + } + + func adjustBox(_ box: CGRect, toFitIn containerSize: CGSize) -> CGRect { + let xScale = containerSize.width / 640.0 + let yScale = containerSize.height / 640.0 + return CGRect( + x: box.origin.x * xScale, y: box.origin.y * yScale, width: box.size.width * xScale, + height: box.size.height * yScale) + } +} + +extension UIColor { + func toRGBComponents() -> (red: UInt8, green: UInt8, blue: UInt8)? 
{ + var red: CGFloat = 0 + var green: CGFloat = 0 + var blue: CGFloat = 0 + var alpha: CGFloat = 0 + + let success = self.getRed(&red, green: &green, blue: &blue, alpha: &alpha) + + if success { + let redUInt8 = UInt8(red * 255.0) + let greenUInt8 = UInt8(green * 255.0) + let blueUInt8 = UInt8(blue * 255.0) + return (red: redUInt8, green: greenUInt8, blue: blueUInt8) + } else { + return nil + } + } +} + +func nonMaxSuppression(boxes: [CGRect], scores: [Float], threshold: Float) -> [Int] { + let sortedIndices = scores.enumerated().sorted { $0.element > $1.element }.map { $0.offset } + var selectedIndices = [Int]() + var activeIndices = [Bool](repeating: true, count: boxes.count) + + for i in 0.. CGFloat(threshold) * min(boxes[idx].area, boxes[otherIdx].area) { + activeIndices[otherIdx] = false + } + } + } + } + } + return selectedIndices +} + +extension CGRect { + var area: CGFloat { + return width * height + } +} diff --git a/YOLO/ViewController.swift b/YOLO/ViewController.swift index 840d91d..578dba1 100644 --- a/YOLO/ViewController.swift +++ b/YOLO/ViewController.swift @@ -1,7 +1,7 @@ // Ultralytics YOLO 🚀 - AGPL-3.0 License // // Main View Controller for Ultralytics YOLO App -// This file is part of the Ultralytics YOLO app, enabling real-time object detection using YOLOv8 models on iOS devices. +// This file is part of the Ultralytics YOLO app, enabling real-time object detection using YOLO11 models on iOS devices. // Licensed under AGPL-3.0. For commercial use, refer to Ultralytics licensing: https://ultralytics.com/license // Access the source code: https://github.com/ultralytics/yolo-ios-app // @@ -17,12 +17,12 @@ import CoreMedia import UIKit import Vision -var mlModel = try! yolov8m(configuration: .init()).model +var mlModel = try! yolo11m(configuration: .init()).model class ViewController: UIViewController { @IBOutlet var videoPreview: UIView! @IBOutlet var View0: UIView! - @IBOutlet var segmentedControl: UISegmentedControl! + @IBOutlet var modelSegmentedControl: UISegmentedControl! @IBOutlet var playButtonOutlet: UIBarButtonItem! @IBOutlet var pauseButtonOutlet: UIBarButtonItem! @IBOutlet var slider: UISlider! @@ -42,6 +42,11 @@ class ViewController: UIViewController { @IBOutlet weak var activityIndicator: UIActivityIndicatorView! @IBOutlet weak var forcus: UIImageView! @IBOutlet weak var toolBar: UIToolbar! + @IBOutlet weak var taskSegmentControl: UISegmentedControl! + + // views for tasks + var classifyOverlay: UILabel! + var segmentPoseOverlay: CALayer = CALayer() let selection = UISelectionFeedbackGenerator() var detector = try! VNCoreMLModel(for: mlModel) @@ -55,6 +60,9 @@ class ViewController: UIViewController { var t3 = CACurrentMediaTime() // FPS start var t4 = 0.0 // FPS dt smoothed // var cameraOutput: AVCapturePhotoOutput! 
+ var longSide: CGFloat = 3 + var shortSide: CGFloat = 4 + var frameSizeCaptured = false // Developer mode let developerMode = UserDefaults.standard.bool(forKey: "developer_mode") // developer mode selected in settings @@ -73,16 +81,90 @@ class ViewController: UIViewController { return request }() + enum Task { + case detect + case classify + case segment + case pose + case obb + } + + var task: Task = .detect + var confidenceThreshold: Float = 0.25 + var iouThreshold: Float = 0.4 + + var classifyLabels = [String]() + var colorsForMasks: [(red: UInt8, green: UInt8, blue: UInt8)] = [] + var classes: [String] = [] + override func viewDidLoad() { super.viewDidLoad() - slider.value = 30 - setLabels() - setUpBoundingBoxViews() setUpOrientationChangeNotification() startVideo() + setupUI() // setModel() } + func setupUI() { + slider.value = 30 + setLabels() + setUpBoundingBoxViews() + setupColors() + setupClassifyOverlay() + setupSegmentPoseOverlay() + } + + func switchUIForTask() { + switch task { + case .detect: + classifyOverlay.isHidden = true + segmentPoseOverlay.isHidden = true + updateSlider(show: true) + case .classify: + classifyOverlay.isHidden = false + hideBoundingBoxes() + segmentPoseOverlay.isHidden = true + updateSlider(show: false) + case .segment: + segmentPoseOverlay.isHidden = false + hideBoundingBoxes() + classifyOverlay.isHidden = true + updateSlider(show: true) + case .pose: + segmentPoseOverlay.isHidden = false + hideBoundingBoxes() + classifyOverlay.isHidden = true + updateSlider(show: true) + default: + break + } + } + + func updateSlider(show: Bool) { + if show { + labelSlider.isHidden = false + labelSliderConf.isHidden = false + labelSliderIoU.isHidden = false + labelSliderConfLandScape.isHidden = false + labelSliderIoULandScape.isHidden = false + slider.isHidden = false + sliderConf.isHidden = false + sliderIoU.isHidden = false + sliderConfLandScape.isHidden = false + sliderIoULandScape.isHidden = false + + } else { + labelSlider.isHidden = true + labelSliderConf.isHidden = true + labelSliderIoU.isHidden = true + labelSliderConfLandScape.isHidden = true + labelSliderIoULandScape.isHidden = true + sliderConfLandScape.isHidden = true + sliderIoULandScape.isHidden = true + + } + } + override func viewWillTransition( to size: CGSize, with coordinator: any UIViewControllerTransitionCoordinator ) { @@ -116,7 +198,15 @@ class ViewController: UIViewController { } self.videoCapture.previewLayer?.frame = CGRect( x: 0, y: 0, width: size.width, height: size.height) + coordinator.animate( + alongsideTransition: { context in + }, + completion: { context in + self.setupSegmentPoseOverlay() + self.updateClassifyOverlay() + } + ) } private func setUpOrientationChangeNotification() { @@ -127,58 +217,63 @@ class ViewController: UIViewController { @objc func orientationDidChange() { videoCapture.updateVideoOrientation() + // frameSizeCaptured = false } @IBAction func vibrate(_ sender: Any) { selection.selectionChanged() } - @IBAction func indexChanged(_ sender: Any) { - selection.selectionChanged() - activityIndicator.startAnimating() - - /// Switch model - switch segmentedControl.selectedSegmentIndex { + @IBAction func taskSegmentChanged(_ sender: UISegmentedControl) { + switch sender.selectedSegmentIndex { case 0: - self.labelName.text = "YOLOv8n" - mlModel = try! 
yolov8n(configuration: .init()).model + task = .detect + modelSegmentedControl.setEnabled(true, forSegmentAt: 1) + modelSegmentedControl.setEnabled(true, forSegmentAt: 2) + modelSegmentedControl.setEnabled(true, forSegmentAt: 3) + modelSegmentedControl.setEnabled(true, forSegmentAt: 4) case 1: - self.labelName.text = "YOLOv8s" - mlModel = try! yolov8s(configuration: .init()).model + task = .classify + modelSegmentedControl.selectedSegmentIndex = 0 + updateModelSegmentControl(enableModelIndex: [0, 1, 2, 3, 4], unableModelIndex: []) + showClassifyUI() case 2: - self.labelName.text = "YOLOv8m" - mlModel = try! yolov8m(configuration: .init()).model + task = .segment + modelSegmentedControl.selectedSegmentIndex = 0 + updateModelSegmentControl(enableModelIndex: [0], unableModelIndex: [1, 2, 3, 4]) case 3: - self.labelName.text = "YOLOv8l" - mlModel = try! yolov8l(configuration: .init()).model - case 4: - self.labelName.text = "YOLOv8x" - mlModel = try! yolov8x(configuration: .init()).model + task = .pose + modelSegmentedControl.selectedSegmentIndex = 0 + updateModelSegmentControl(enableModelIndex: [0], unableModelIndex: [1, 2, 3, 4]) default: - break + updateModelSegmentControl(enableModelIndex: [], unableModelIndex: [0, 1, 2, 3, 4]) } + switchUIForTask() setModel() - setUpBoundingBoxViews() - activityIndicator.stopAnimating() + if task == .classify { + setupClassifyLabels() + } else { + setupColors() + } } - func setModel() { + func updateModelSegmentControl(enableModelIndex: [Int], unableModelIndex: [Int]) { + for index in enableModelIndex { + modelSegmentedControl.setEnabled(true, forSegmentAt: index) + } - /// VNCoreMLModel - detector = try! VNCoreMLModel(for: mlModel) - detector.featureProvider = ThresholdProvider() + for index in unableModelIndex { + modelSegmentedControl.setEnabled(false, forSegmentAt: index) + } + } - /// VNCoreMLRequest - let request = VNCoreMLRequest( - model: detector, - completionHandler: { [weak self] request, error in - self?.processObservations(for: request, error: error) - }) - request.imageCropAndScaleOption = .scaleFill // .scaleFit, .scaleFill, .centerCrop - visionRequest = request - t2 = 0.0 // inference dt smoothed - t3 = CACurrentMediaTime() // FPS start - t4 = 0.0 // FPS dt smoothed + @IBAction func indexChanged(_ sender: Any) { + selection.selectionChanged() + activityIndicator.startAnimating() + setModel() + setUpBoundingBoxViews() + setupColors() + activityIndicator.stopAnimating() } /// Update thresholds from slider values @@ -222,7 +317,7 @@ class ViewController: UIViewController { } func setLabels() { - self.labelName.text = "YOLOv8m" + self.labelName.text = "YOLO11m" self.labelVersion.text = "Version " + UserDefaults.standard.string(forKey: "app_version")! 
} @@ -279,6 +374,29 @@ class ViewController: UIViewController { let maxBoundingBoxViews = 100 var boundingBoxViews = [BoundingBoxView]() var colors: [String: UIColor] = [:] + var colorsForMask: [(red: UInt8, green: UInt8, blue: UInt8)] = [] + let ultralyticsColorsolors: [UIColor] = [ + UIColor(red: 4 / 255, green: 42 / 255, blue: 255 / 255, alpha: 0.6), // #042AFF + UIColor(red: 11 / 255, green: 219 / 255, blue: 235 / 255, alpha: 0.6), // #0BDBEB + UIColor(red: 243 / 255, green: 243 / 255, blue: 243 / 255, alpha: 0.6), // #F3F3F3 + UIColor(red: 0 / 255, green: 223 / 255, blue: 183 / 255, alpha: 0.6), // #00DFB7 + UIColor(red: 17 / 255, green: 31 / 255, blue: 104 / 255, alpha: 0.6), // #111F68 + UIColor(red: 255 / 255, green: 111 / 255, blue: 221 / 255, alpha: 0.6), // #FF6FDD + UIColor(red: 255 / 255, green: 68 / 255, blue: 79 / 255, alpha: 0.6), // #FF444F + UIColor(red: 204 / 255, green: 237 / 255, blue: 0 / 255, alpha: 0.6), // #CCED00 + UIColor(red: 0 / 255, green: 243 / 255, blue: 68 / 255, alpha: 0.6), // #00F344 + UIColor(red: 189 / 255, green: 0 / 255, blue: 255 / 255, alpha: 0.6), // #BD00FF + UIColor(red: 0 / 255, green: 180 / 255, blue: 255 / 255, alpha: 0.6), // #00B4FF + UIColor(red: 221 / 255, green: 0 / 255, blue: 186 / 255, alpha: 0.6), // #DD00BA + UIColor(red: 0 / 255, green: 255 / 255, blue: 255 / 255, alpha: 0.6), // #00FFFF + UIColor(red: 38 / 255, green: 192 / 255, blue: 0 / 255, alpha: 0.6), // #26C000 + UIColor(red: 1 / 255, green: 255 / 255, blue: 179 / 255, alpha: 0.6), // #01FFB3 + UIColor(red: 125 / 255, green: 36 / 255, blue: 255 / 255, alpha: 0.6), // #7D24FF + UIColor(red: 123 / 255, green: 0 / 255, blue: 104 / 255, alpha: 0.6), // #7B0068 + UIColor(red: 255 / 255, green: 27 / 255, blue: 108 / 255, alpha: 0.6), // #FF1B6C + UIColor(red: 252 / 255, green: 109 / 255, blue: 47 / 255, alpha: 0.6), // #FC6D2F + UIColor(red: 162 / 255, green: 255 / 255, blue: 11 / 255, alpha: 0.6), // #A2FF0B + ] func setUpBoundingBoxViews() { // Ensure all bounding box views are initialized up to the maximum allowed. @@ -287,20 +405,45 @@ class ViewController: UIViewController { } // Retrieve class labels directly from the CoreML model's class labels, if available. + } + + func setupColors() { guard let classLabels = mlModel.modelDescription.classLabels as? [String] else { - fatalError("Class labels are missing from the model description") + print("Class labels are missing from the model description") + return } - + classes = classLabels // Assign random colors to the classes. + var count = 0 for label in classLabels { + let color = ultralyticsColorsolors[count] + count += 1 + if count > 19 { + count = 0 + } if colors[label] == nil { // if key not in dict - colors[label] = UIColor( - red: CGFloat.random(in: 0...1), - green: CGFloat.random(in: 0...1), - blue: CGFloat.random(in: 0...1), - alpha: 0.6) + colors[label] = color } } + + count = 0 + for (key, color) in colors { + let color = ultralyticsColorsolors[count] + count += 1 + if count > 19 { + count = 0 + } + guard let colorForMask = color.toRGBComponents() else { fatalError() } + colorsForMask.append(colorForMask) + } + } + + func setupClassifyLabels() { + guard let classLabels = mlModel.modelDescription.classLabels as? 
[String] else { + print("Class labels are missing from the model description") + return + } + classifyLabels = classLabels } func startVideo() { @@ -315,6 +458,7 @@ class ViewController: UIViewController { self.videoPreview.layer.addSublayer(previewLayer) self.videoCapture.previewLayer?.frame = self.videoPreview.bounds // resize preview layer } + self.videoPreview.layer.addSublayer(self.segmentPoseOverlay) // Add the bounding box layers to the UI, on top of the video preview. for box in self.boundingBoxViews { @@ -327,64 +471,6 @@ class ViewController: UIViewController { } } - func predict(sampleBuffer: CMSampleBuffer) { - if currentBuffer == nil, let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) { - currentBuffer = pixelBuffer - - /// - Tag: MappingOrientation - // The frame is always oriented based on the camera sensor, - // so in most cases Vision needs to rotate it for the model to work as expected. - let imageOrientation: CGImagePropertyOrientation - switch UIDevice.current.orientation { - case .portrait: - imageOrientation = .up - case .portraitUpsideDown: - imageOrientation = .down - case .landscapeLeft: - imageOrientation = .up - case .landscapeRight: - imageOrientation = .up - case .unknown: - imageOrientation = .up - default: - imageOrientation = .up - } - - // Invoke a VNRequestHandler with that image - let handler = VNImageRequestHandler( - cvPixelBuffer: pixelBuffer, orientation: imageOrientation, options: [:]) - if UIDevice.current.orientation != .faceUp { // stop if placed down on a table - t0 = CACurrentMediaTime() // inference start - do { - try handler.perform([visionRequest]) - } catch { - print(error) - } - t1 = CACurrentMediaTime() - t0 // inference dt - } - - currentBuffer = nil - } - } - - func processObservations(for request: VNRequest, error: Error?) { - DispatchQueue.main.async { - if let results = request.results as? [VNRecognizedObjectObservation] { - self.show(predictions: results) - } else { - self.show(predictions: []) - } - - // Measure FPS - if self.t1 < 10.0 { // valid dt - self.t2 = self.t1 * 0.05 + self.t2 * 0.95 // smoothed inference time - } - self.t4 = (CACurrentMediaTime() - self.t3) * 0.05 + self.t4 * 0.95 // smoothed delivered FPS - self.labelFPS.text = String(format: "%.1f FPS - %.1f ms", 1 / self.t4, self.t2 * 1000) // t2 seconds to ms - self.t3 = CACurrentMediaTime() - } - } - // Save text file func saveText(text: String, file: String = "saved.txt") { if let dir = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first { @@ -448,134 +534,6 @@ class ViewController: UIViewController { } } - func show(predictions: [VNRecognizedObjectObservation]) { - let width = videoPreview.bounds.width // 375 pix - let height = videoPreview.bounds.height // 812 pix - var str = "" - - // ratio = videoPreview AR divided by sessionPreset AR - var ratio: CGFloat = 1.0 - if videoCapture.captureSession.sessionPreset == .photo { - ratio = (height / width) / (4.0 / 3.0) // .photo - } else { - ratio = (height / width) / (16.0 / 9.0) // .hd4K3840x2160, .hd1920x1080, .hd1280x720 etc. 
- } - - // date - let date = Date() - let calendar = Calendar.current - let hour = calendar.component(.hour, from: date) - let minutes = calendar.component(.minute, from: date) - let seconds = calendar.component(.second, from: date) - let nanoseconds = calendar.component(.nanosecond, from: date) - let sec_day = - Double(hour) * 3600.0 + Double(minutes) * 60.0 + Double(seconds) + Double(nanoseconds) / 1E9 // seconds in the day - - self.labelSlider.text = - String(predictions.count) + " items (max " + String(Int(slider.value)) + ")" - for i in 0..= 1 { // iPhone ratio = 1.218 - let offset = (1 - ratio) * (0.5 - rect.minX) - let transform = CGAffineTransform(scaleX: 1, y: -1).translatedBy(x: offset, y: -1) - rect = rect.applying(transform) - rect.size.width *= ratio - } else { // iPad ratio = 0.75 - let offset = (ratio - 1) * (0.5 - rect.maxY) - let transform = CGAffineTransform(scaleX: 1, y: -1).translatedBy(x: 0, y: offset - 1) - rect = rect.applying(transform) - ratio = (height / width) / (3.0 / 4.0) - rect.size.height /= ratio - } - - // Scale normalized to pixels [375, 812] [width, height] - rect = VNImageRectForNormalizedRect(rect, Int(width), Int(height)) - - // The labels array is a list of VNClassificationObservation objects, - // with the highest scoring class first in the list. - let bestClass = prediction.labels[0].identifier - let confidence = prediction.labels[0].confidence - // print(confidence, rect) // debug (confidence, xywh) with xywh origin top left (pixels) - let label = String(format: "%@ %.1f", bestClass, confidence * 100) - let alpha = CGFloat((confidence - 0.2) / (1.0 - 0.2) * 0.9) - // Show the bounding box. - boundingBoxViews[i].show( - frame: rect, - label: label, - color: colors[bestClass] ?? UIColor.white, - alpha: alpha) // alpha 0 (transparent) to 1 (opaque) for conf threshold 0.2 to 1.0) - - if developerMode { - // Write - if save_detections { - str += String( - format: "%.3f %.3f %.3f %@ %.2f %.1f %.1f %.1f %.1f\n", - sec_day, freeSpace(), UIDevice.current.batteryLevel, bestClass, confidence, - rect.origin.x, rect.origin.y, rect.size.width, rect.size.height) - } - - // Action trigger upon detection - // if false { - // if (bestClass == "car") { // "cell phone", "car", "person" - // self.takePhoto(nil) - // // self.pauseButton(nil) - // sleep(2) - // } - // } - } - } else { - boundingBoxViews[i].hide() - } - } - - // Write - if developerMode { - if save_detections { - saveText(text: str, file: "detections.txt") // Write stats for each detection - } - if save_frames { - str = String( - format: "%.3f %.3f %.3f %.3f %.1f %.1f %.1f\n", - sec_day, freeSpace(), memoryUsage(), UIDevice.current.batteryLevel, - self.t1 * 1000, self.t2 * 1000, 1 / self.t4) - saveText(text: str, file: "frames.txt") // Write stats for each image - } - } - - // Debug - // print(str) - // print(UIDevice.current.identifierForVendor!) - // saveImage() - } - // Pinch to Zoom Start --------------------------------------------------------------------------------------------- let minimumZoom: CGFloat = 1.0 let maximumZoom: CGFloat = 10.0