diff --git a/.github/workflows/release-cli.yml b/.github/workflows/release-cli.yml
index c1e4aef9..2a6c81b9 100644
--- a/.github/workflows/release-cli.yml
+++ b/.github/workflows/release-cli.yml
@@ -77,8 +77,12 @@ jobs:
       - name: Create deployment package
         run: |
-          ls -R target/
-          tar -czf screenpipe-${{ env.VERSION }}-${{ matrix.target }}.tar.gz -C target/${{ matrix.target }}/release screenpipe
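+          # stage the CLI binary and the Swift OCR dylib in a bin/ + lib/ layout, matching what the Homebrew formula installs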
+          mkdir -p screenpipe-${{ env.VERSION }}-${{ matrix.target }}/bin
+          mkdir -p screenpipe-${{ env.VERSION }}-${{ matrix.target }}/lib
+          cp target/${{ matrix.target }}/release/screenpipe screenpipe-${{ env.VERSION }}-${{ matrix.target }}/bin/
+          cp target/${{ matrix.target }}/release/libscreenpipe.dylib screenpipe-${{ env.VERSION }}-${{ matrix.target }}/lib/
+          tar -czf screenpipe-${{ env.VERSION }}-${{ matrix.target }}.tar.gz -C screenpipe-${{ env.VERSION }}-${{ matrix.target }} .

      - name: Calculate SHA256
        run: |
diff --git a/Formula/screenpipe.rb b/Formula/screenpipe.rb
index 04ae2678..22925104 100644
--- a/Formula/screenpipe.rb
+++ b/Formula/screenpipe.rb
@@ -18,7 +18,8 @@ class Screenpipe < Formula
   depends_on "tesseract"

   def install
-    bin.install "screenpipe"
+    bin.install "screenpipe"
+    lib.install "libscreenpipe.dylib"
   end

   test do
diff --git a/examples/apps/screenpipe-app-tauri/src-tauri/tauri.macos.conf.json b/examples/apps/screenpipe-app-tauri/src-tauri/tauri.macos.conf.json
index 78d6e91f..9a645845 100644
--- a/examples/apps/screenpipe-app-tauri/src-tauri/tauri.macos.conf.json
+++ b/examples/apps/screenpipe-app-tauri/src-tauri/tauri.macos.conf.json
@@ -9,7 +9,8 @@
       "ffmpeg/lib/libavutil.59.dylib",
       "ffmpeg/lib/libffmpeg.7.dylib",
       "ffmpeg/lib/libswresample.5.dylib",
-      "ffmpeg/lib/libswscale.8.dylib"
+      "ffmpeg/lib/libswscale.8.dylib",
+      "../../../../screenpipe-vision/lib/libscreenpipe.dylib"
     ],
     "entitlements": "entitlements.plist",
     "signingIdentity": "-",
diff --git a/screenpipe-server/src/bin/screenpipe-server.rs b/screenpipe-server/src/bin/screenpipe-server.rs
index 63f6d9c1..397dcadb 100644
--- a/screenpipe-server/src/bin/screenpipe-server.rs
+++ b/screenpipe-server/src/bin/screenpipe-server.rs
@@ -15,8 +15,7 @@ use crossbeam::queue::SegQueue;
 use dirs::home_dir;
 use log::{debug, error, info, LevelFilter};
 use screenpipe_audio::{
-    default_input_device, default_output_device, list_audio_devices, parse_audio_device,
-    DeviceControl,
+    default_input_device, list_audio_devices, parse_audio_device, DeviceControl,
 };
 use screenpipe_vision::OcrEngine;
 use std::io::Write;
@@ -34,6 +33,7 @@ enum CliOcrEngine {
     Unstructured,
     Tesseract,
     WindowsNative,
+    AppleNative,
 }

 impl From<CliOcrEngine> for CoreOcrEngine {
@@ -42,6 +42,7 @@ impl From<CliOcrEngine> for CoreOcrEngine {
             CliOcrEngine::Unstructured => CoreOcrEngine::Unstructured,
             CliOcrEngine::Tesseract => CoreOcrEngine::Tesseract,
             CliOcrEngine::WindowsNative => CoreOcrEngine::WindowsNative,
+            CliOcrEngine::AppleNative => CoreOcrEngine::AppleNative,
         }
     }
 }
@@ -233,14 +234,16 @@ async fn main() -> anyhow::Result<()> {
         }
         // audio output only supported on linux atm
         // see https://github.com/louis030195/screen-pipe/pull/106
-        #[cfg(target_os = "linux")]
-        if let Ok(output_device) = default_output_device() {
-            audio_devices.push(Arc::new(output_device.clone()));
-            let device_control = DeviceControl {
-                is_running: true,
-                is_paused: false,
-            };
-            devices_status.insert(output_device, device_control);
+        if cfg!(target_os = "linux") {
+            use screenpipe_audio::default_output_device;
+            if let Ok(output_device) = default_output_device() {
+                audio_devices.push(Arc::new(output_device.clone()));
+                let device_control = DeviceControl {
+                    is_running: true,
+                    is_paused: false,
+                };
+                devices_status.insert(output_device, device_control);
+            }
         }
     } else {
         // Use specified devices
diff --git a/screenpipe-server/src/video.rs b/screenpipe-server/src/video.rs
index f2ce581a..37f94fc3 100644
--- a/screenpipe-server/src/video.rs
+++ b/screenpipe-server/src/video.rs
@@ -2,7 +2,8 @@ use chrono::Utc;
 use image::ImageFormat::{self};
 use log::{debug, error, info, warn};
 use screenpipe_core::find_ffmpeg_path;
-use screenpipe_vision::{continuous_capture, get_monitor, CaptureResult, OcrEngine};
+use screenpipe_vision::core::get_monitor;
+use screenpipe_vision::{continuous_capture, CaptureResult, OcrEngine};
 use std::collections::VecDeque;
 use std::path::PathBuf;
 use std::process::Stdio;
diff --git a/screenpipe-vision/Cargo.toml b/screenpipe-vision/Cargo.toml
index a9274e8f..e5eff48a 100644
--- a/screenpipe-vision/Cargo.toml
+++ b/screenpipe-vision/Cargo.toml
@@ -66,6 +66,13 @@ assert_cmd = "2.0.14"
 predicates = "3.1.0"
 assert_fs = "1.1.1"
+
+[build-dependencies]
+cc = "1.0"
+
+[package.metadata.osx]
+framework = ["Vision", "AppKit"]
+
 [[bin]]
 name = "screenpipe-vision"
 path = "src/bin/screenpipe-vision.rs"

@@ -78,3 +85,5 @@ harness = false

 [target.'cfg(target_os = "windows")'.dependencies]
 windows = { version = "0.58", features = ["Graphics_Imaging", "Media_Ocr", "Storage", "Storage_Streams"] }
+[target.'cfg(target_os = "macos")'.dependencies]
+libc = "0.2"
diff --git a/screenpipe-vision/build.rs b/screenpipe-vision/build.rs
new file mode 100644
index 00000000..2a14c61f
--- /dev/null
+++ b/screenpipe-vision/build.rs
@@ -0,0 +1,20 @@
+use std::env;
+
+#[cfg(target_os = "macos")]
+fn main() {
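+    // DESTINATION is expected to be set by the packaging pipeline ("brew" or "tauri");
+    // any other value falls back to the in-repo development path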
+    let destination = env::var("DESTINATION").unwrap_or_default();
+
+    if destination == "brew" {
+        println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path/../lib");
+    } else if destination == "tauri" {
+        println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path/../Frameworks");
+    } else {
+        println!("cargo:rustc-link-arg=-Wl,-rpath,@executable_path/../../screenpipe-vision/lib");
+    }
+
+    println!("cargo:rustc-link-lib=dylib=screenpipe");
+}
+
+#[cfg(not(target_os = "macos"))]
+fn main() {}
diff --git a/screenpipe-vision/src/apple.rs b/screenpipe-vision/src/apple.rs
new file mode 100644
index 00000000..101c9780
--- /dev/null
+++ b/screenpipe-vision/src/apple.rs
@@ -0,0 +1,32 @@
+use image::DynamicImage;
+use std::ffi::CStr;
+use std::os::raw::{c_char, c_uchar};
+
+#[link(name = "screenpipe")]
+extern "C" {
+    fn perform_ocr(
+        image_data: *const c_uchar,
+        length: usize,
+        width: i32,
+        height: i32,
+    ) -> *mut c_char;
+}
+
+pub fn perform_ocr_apple(image: &DynamicImage) -> String {
+    let rgba = image.to_rgba8();
+    let (width, height) = rgba.dimensions();
+    let raw_data = rgba.as_raw();
+
+    unsafe {
+        let result_ptr = perform_ocr(
+            raw_data.as_ptr(),
+            raw_data.len(),
+            width as i32,
+            height as i32,
+        );
+        let result = CStr::from_ptr(result_ptr).to_string_lossy().into_owned();
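+        // ocr.swift allocates the returned buffer with strdup, so free it here to avoid leaking on every frame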
+        libc::free(result_ptr as *mut libc::c_void);
+        result
+    }
+}
diff --git a/screenpipe-vision/src/bin/screenpipe-vision.rs b/screenpipe-vision/src/bin/screenpipe-vision.rs
index ba72ca2c..a745ffd1 100644
--- a/screenpipe-vision/src/bin/screenpipe-vision.rs
+++ b/screenpipe-vision/src/bin/screenpipe-vision.rs
@@ -1,5 +1,5 @@
 use clap::Parser;
-use screenpipe_vision::{continuous_capture, get_monitor, OcrEngine};
+use screenpipe_vision::{continuous_capture, core::get_monitor, OcrEngine};
 use std::{sync::Arc, time::Duration};
 use tokio::sync::mpsc::channel;

diff --git a/screenpipe-vision/src/core.rs b/screenpipe-vision/src/core.rs
index 3111ede0..2f53bfff 100644
--- a/screenpipe-vision/src/core.rs
+++ b/screenpipe-vision/src/core.rs
@@ -11,15 +11,21 @@ use strsim::levenshtein;
 use tokio::sync::{mpsc::Sender, Mutex}; // Corrected import for Mutex
 use xcap::{Monitor, Window};

+#[cfg(target_os = "macos")]
+use crate::apple::perform_ocr_apple;
 #[cfg(target_os = "windows")]
 use crate::utils::perform_ocr_windows;
 use crate::utils::OcrEngine;
 use crate::utils::{
-    capture_screenshot, compare_with_previous_image, perform_ocr_tesseract,
-    save_text_files,
+    capture_screenshot, compare_with_previous_image, perform_ocr_tesseract, save_text_files,
 };
-use rusty_tesseract::{Data, DataOutput}; // Add this import
+use rusty_tesseract::{Data, DataOutput};
 use screenpipe_integrations::unstructured_ocr::perform_ocr_cloud;
+pub enum ControlMessage {
+    Pause,
+    Resume,
+    Stop,
+}

 pub struct DataOutputWrapper {
     pub data_output: rusty_tesseract::tesseract::output_data::DataOutput,
@@ -135,7 +141,10 @@ pub async fn continuous_capture(

         // Skip the frame if the current average difference is less than 0.006
         if current_average < 0.006 {
-            debug!("Skipping frame {} due to low average difference: {:.3}", frame_counter, current_average);
+            debug!(
+                "Skipping frame {} due to low average difference: {:.3}",
+                frame_counter, current_average
+            );
             frame_counter += 1;
             tokio::time::sleep(interval).await;
             continue;
@@ -194,7 +203,7 @@ pub async fn continuous_capture(
                 });
                 frame_counter = 0; // Reset frame_counter after OCR task is processed

-                 // Reset max_average and max_avg_value after spawning the OCR task
+                // Reset max_average and max_avg_value after spawning the OCR task
                 max_avg_value = 0.0;
             }
         }
@@ -244,6 +253,24 @@ pub async fn process_ocr_task(
             debug!("Windows Native OCR");
             perform_ocr_windows(&image_arc).await
         }
+        #[cfg(target_os = "macos")]
+        OcrEngine::AppleNative => {
+            debug!("Apple Native OCR");
+            let text = perform_ocr_apple(&image_arc);
+            (
+                text.clone(),
+                DataOutput {
+                    output: String::new(),
+                    data: vec![],
+                },
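+                // per-line confidence from Vision is not plumbed through the FFI yet, so report a constant placeholder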
"No text found" : ocrResult) +} + +// swiftc -emit-library -o screenpipe-vision/lib/libscreenpipe.dylib screenpipe-vision/src/ocr.swift + +// or +// swiftc -emit-library -o /usr/local/lib/libscreenpipe.dylib screenpipe-vision/src/ocr.swift diff --git a/screenpipe-vision/src/utils.rs b/screenpipe-vision/src/utils.rs index 509ac0cb..972a9efc 100644 --- a/screenpipe-vision/src/utils.rs +++ b/screenpipe-vision/src/utils.rs @@ -1,4 +1,4 @@ -use crate::core::MaxAverageFrame; // Assuming core.rs is in the same crate under the `core` module +use crate::core::MaxAverageFrame; use image::DynamicImage; use image_compare::{Algorithm, Metric, Similarity}; // Added import for Similarity use log::{debug, error}; @@ -18,6 +18,7 @@ pub enum OcrEngine { Unstructured, Tesseract, WindowsNative, + AppleNative, } impl Default for OcrEngine { diff --git a/screenpipe-vision/tests/apple_vision_test.rs b/screenpipe-vision/tests/apple_vision_test.rs new file mode 100644 index 00000000..a331b10c --- /dev/null +++ b/screenpipe-vision/tests/apple_vision_test.rs @@ -0,0 +1,37 @@ +#[cfg(target_os = "macos")] +#[cfg(test)] +mod tests { + use image::GenericImageView; + use screenpipe_vision::perform_ocr_apple; + use std::path::PathBuf; + + #[tokio::test] + async fn test_apple_native_ocr() { + let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + path.push("tests"); + path.push("testing_OCR.png"); + println!("Path to testing_OCR.png: {:?}", path); + + // Check if file exists and print its size + if let Ok(metadata) = std::fs::metadata(&path) { + println!("File size: {} bytes", metadata.len()); + } + + // Attempt to open the image + let image = image::open(&path).expect("Failed to open image"); + println!("Image dimensions: {:?}", image.dimensions()); + + // Convert image to RGB format + let rgb_image = image.to_rgb8(); + println!("RGB image dimensions: {:?}", rgb_image.dimensions()); + + let result = perform_ocr_apple(&image); + + println!("OCR text: {:?}", result); + assert!( + result.contains("ocr_tx.receiver_count"), + "OCR failed: {:?}", + result + ); + } +}