Commit a585f9b

feat: apple native OCR, works

louis030195 committed Aug 8, 2024
1 parent fc1356f commit a585f9b
Showing 9 changed files with 180 additions and 194 deletions.
2 changes: 2 additions & 0 deletions screenpipe-server/src/bin/screenpipe-server.rs
@@ -34,6 +34,7 @@ enum CliOcrEngine {
    Unstructured,
    Tesseract,
    WindowsNative,
    AppleNative,
}

impl From<CliOcrEngine> for CoreOcrEngine {
@@ -42,6 +43,7 @@ impl From<CliOcrEngine> for CoreOcrEngine {
            CliOcrEngine::Unstructured => CoreOcrEngine::Unstructured,
            CliOcrEngine::Tesseract => CoreOcrEngine::Tesseract,
            CliOcrEngine::WindowsNative => CoreOcrEngine::WindowsNative,
            CliOcrEngine::AppleNative => CoreOcrEngine::AppleNative,
        }
    }
}
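For context, a minimal sketch of how the new variant could surface on the command line, assuming the CLI uses clap's ValueEnum derive (the flag name and kebab-case mapping are illustrative, not confirmed by this diff):

use clap::{Parser, ValueEnum};

// Hypothetical mirror of the CLI enum; clap's ValueEnum derive maps
// variant names to kebab-case, so AppleNative parses from "apple-native".
#[derive(Clone, Debug, ValueEnum)]
enum CliOcrEngine {
    Unstructured,
    Tesseract,
    WindowsNative,
    AppleNative,
}

#[derive(Parser)]
struct Args {
    // Flag name assumed for illustration.
    #[arg(long, value_enum)]
    ocr_engine: Option<CliOcrEngine>,
}

fn main() {
    // e.g. screenpipe-server --ocr-engine apple-native
    let args = Args::parse();
    println!("{:?}", args.ocr_engine);
}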
19 changes: 8 additions & 11 deletions screenpipe-vision/Cargo.toml
@@ -66,6 +66,13 @@ assert_cmd = "2.0.14"
predicates = "3.1.0"
assert_fs = "1.1.1"


[build-dependencies]
cc = "1.0"

[package.metadata.osx]
framework = ["Vision", "AppKit"]

[[bin]]
name = "screenpipe-vision"
path = "src/bin/screenpipe-vision.rs"
@@ -79,14 +86,4 @@ harness = false
windows = { version = "0.58", features = ["Graphics_Imaging", "Media_Ocr", "Storage", "Storage_Streams"] }

[target.'cfg(target_os = "macos")'.dependencies]
core-foundation = "0.9.4"
core-graphics = "0.23.2"
objc = { version = "0.2.7", features = ["exception"] }
block = "0.1"
foreign-types-shared = "0.1"
cocoa-foundation = "0.1.2"

[build-dependencies]
cc = "1.0"
libc = "0.2"
6 changes: 2 additions & 4 deletions screenpipe-vision/build.rs
@@ -1,9 +1,7 @@
#[cfg(target_os = "macos")]
fn main() {
    // println!("cargo:rustc-link-lib=framework=Vision");
    // println!("cargo:rustc-link-lib=framework=Foundation");
    // println!("cargo:rustc-link-lib=framework=CoreGraphics");
    println!("cargo:rustc-link-lib=framework=Vision");
    println!("cargo:rustc-link-search=native=screenpipe-vision/lib");
    println!("cargo:rustc-link-lib=dylib=ocr");
}

#[cfg(not(target_os = "macos"))]
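The hunk is collapsed past this point; presumably the non-macOS branch stays a no-op, along the lines of this sketch (an assumption, since the line is hidden above):

#[cfg(not(target_os = "macos"))]
fn main() {}

Note that screenpipe-vision/lib/libocr.dylib must exist before Cargo can link it; the comment at the bottom of ocr.swift (added later in this commit) gives the swiftc invocation that produces it.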
31 changes: 31 additions & 0 deletions screenpipe-vision/src/apple.rs
@@ -0,0 +1,31 @@
use image::DynamicImage;
use std::ffi::CStr;
use std::os::raw::{c_char, c_uchar};

#[link(name = "ocr")]
extern "C" {
    // Implemented in ocr.swift; returns a malloc'd, NUL-terminated string.
    fn perform_ocr(
        image_data: *const c_uchar,
        length: usize,
        width: i32,
        height: i32,
    ) -> *mut c_char;
}

pub fn perform_ocr_apple(image: &DynamicImage) -> String {
    // Hand the Swift side tightly packed RGBA8 pixels.
    let rgba = image.to_rgba8();
    let (width, height) = rgba.dimensions();
    let raw_data = rgba.as_raw();

    unsafe {
        let result_ptr = perform_ocr(
            raw_data.as_ptr(),
            raw_data.len(),
            width as i32,
            height as i32,
        );
        let result = CStr::from_ptr(result_ptr).to_string_lossy().into_owned();
        // The Swift side allocates with strdup, so release with libc::free.
        libc::free(result_ptr as *mut libc::c_void);
        result
    }
}
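A hedged usage sketch for the wrapper above, assuming a macOS build with libocr.dylib on the linker search path (the image path is illustrative):

use screenpipe_vision::perform_ocr_apple;

fn main() {
    // Illustrative path; any image convertible to RGBA8 works.
    let img = image::open("screenshot.png").expect("failed to open image");
    let text = perform_ocr_apple(&img);
    println!("{}", text);
}

The ownership contract is the subtle part: Swift allocates the result with strdup and Rust releases it with libc::free, which is sound because both go through the system malloc.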
36 changes: 29 additions & 7 deletions screenpipe-vision/src/core.rs
@@ -11,15 +11,22 @@ use strsim::levenshtein;
use tokio::sync::{mpsc::Sender, Mutex}; // Corrected import for Mutex
use xcap::{Monitor, Window};

#[cfg(target_os = "macos")]
use crate::apple::perform_ocr_apple;
#[cfg(target_os = "windows")]
use crate::utils::perform_ocr_windows;
use crate::utils::OcrEngine;
use crate::utils::{
    capture_screenshot, compare_with_previous_image, perform_ocr_tesseract, save_text_files,
};
use crate::utils::{perform_ocr_apple, OcrEngine};
use rusty_tesseract::{Data, DataOutput};
use screenpipe_integrations::unstructured_ocr::perform_ocr_cloud;
pub enum ControlMessage {
    Pause,
    Resume,
    Stop,
}
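ControlMessage presumably travels over a channel into the capture loop; a minimal sketch of that pattern (the channel wiring is illustrative, not part of this diff):

use screenpipe_vision::ControlMessage;
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<ControlMessage>(8);
    tx.send(ControlMessage::Pause).await.unwrap();

    // A capture loop would poll the receiver between frames.
    match rx.recv().await {
        Some(ControlMessage::Pause) => println!("capture paused"),
        Some(ControlMessage::Resume) => println!("capture resumed"),
        Some(ControlMessage::Stop) | None => println!("capture stopped"),
    }
}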

pub struct DataOutputWrapper {
    pub data_output: rusty_tesseract::tesseract::output_data::DataOutput,
@@ -135,7 +142,10 @@ pub async fn continuous_capture(

        // Skip the frame if the current average difference is less than 0.006
        if current_average < 0.006 {
            debug!(
                "Skipping frame {} due to low average difference: {:.3}",
                frame_counter, current_average
            );
            frame_counter += 1;
            tokio::time::sleep(interval).await;
            continue;
@@ -194,7 +204,7 @@
        });

        frame_counter = 0; // Reset frame_counter after OCR task is processed
        // Reset max_average and max_avg_value after spawning the OCR task
        max_avg_value = 0.0;
    }
}
@@ -247,7 +257,19 @@ pub async fn process_ocr_task(
        #[cfg(target_os = "macos")]
        OcrEngine::AppleNative => {
            debug!("Apple Native OCR");
            let text = perform_ocr_apple(&image_arc);
            (
                text.clone(),
                DataOutput {
                    output: String::new(),
                    data: vec![],
                },
                serde_json::json!([{
                    "text": text,
                    "confidence": "1.0",
                }])
                .to_string(),
            )
        }
        _ => {
            error!("Unsupported OCR engine");
@@ -329,4 +351,4 @@
        frame_number, _duration
    );
    Ok(())
}
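Each engine arm funnels into the same (String, DataOutput, String) shape: plain text, Tesseract-style structured data, and a JSON rendering. Note that the AppleNative arm encodes confidence as the string "1.0" rather than a number; a small sketch of consuming that JSON (values illustrative):

fn main() {
    // Same shape as the JSON built in the AppleNative arm above.
    let json = serde_json::json!([{ "text": "hello", "confidence": "1.0" }]).to_string();
    let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
    assert_eq!(parsed[0]["text"], "hello");
    assert_eq!(parsed[0]["confidence"], "1.0"); // a string, not a number
}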
5 changes: 4 additions & 1 deletion screenpipe-vision/src/lib.rs
@@ -1,4 +1,7 @@
pub mod apple;
pub mod core;
pub mod utils;
pub use core::{continuous_capture, get_monitor, process_ocr_task, CaptureResult};
#[cfg(target_os = "macos")]
pub use apple::perform_ocr_apple;
pub use core::{continuous_capture, process_ocr_task, CaptureResult, ControlMessage};
pub use utils::{perform_ocr_tesseract, OcrEngine};
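Because the perform_ocr_apple re-export is gated on macOS, downstream callers need the same cfg guard; one common pattern, sketched under that assumption (the fallback message is illustrative):

#[cfg(target_os = "macos")]
fn ocr(image: &image::DynamicImage) -> String {
    screenpipe_vision::perform_ocr_apple(image)
}

#[cfg(not(target_os = "macos"))]
fn ocr(_image: &image::DynamicImage) -> String {
    // On other platforms, fall back to another engine (e.g. Tesseract).
    String::from("apple native OCR unavailable on this platform")
}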
86 changes: 86 additions & 0 deletions screenpipe-vision/src/ocr.swift
@@ -0,0 +1,86 @@
import CoreGraphics
import Foundation
import Vision

@_cdecl("perform_ocr")
public func performOCR(imageData: UnsafePointer<UInt8>, length: Int, width: Int, height: Int)
    -> UnsafeMutablePointer<CChar>? {

    print("Attempting to create image from raw data")
    print("Image dimensions: \(width)x\(height)")

    guard let dataProvider = CGDataProvider(data: Data(bytes: imageData, count: length) as CFData)
    else {
        print("Failed to create CGDataProvider.")
        return strdup("Error: Failed to create CGDataProvider")
    }

    guard
        let cgImage = CGImage(
            width: width,
            height: height,
            bitsPerComponent: 8,
            bitsPerPixel: 32,
            bytesPerRow: width * 4,
            space: CGColorSpaceCreateDeviceRGB(),
            bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.premultipliedLast.rawValue),
            provider: dataProvider,
            decode: nil,
            shouldInterpolate: false,
            intent: .defaultIntent
        )
    else {
        print("Failed to create CGImage.")
        return strdup("Error: Failed to create CGImage")
    }

    print("CGImage created successfully.")

    let semaphore = DispatchSemaphore(value: 0)
    var ocrResult = ""

    let request = VNRecognizeTextRequest { request, error in
        defer { semaphore.signal() }

        if let error = error {
            print("Error in text recognition request: \(error)")
            ocrResult = "Error: \(error.localizedDescription)"
            return
        }

        guard let observations = request.results as? [VNRecognizedTextObservation] else {
            print("Failed to process image or no text found.")
            ocrResult = "Error: Failed to process image or no text found"
            return
        }

        print("Number of text observations: \(observations.count)")

        for (index, observation) in observations.enumerated() {
            guard let topCandidate = observation.topCandidates(1).first else {
                print("No top candidate for observation \(index)")
                continue
            }
            ocrResult += "\(topCandidate.string)\n"
        }
    }

    request.recognitionLevel = .accurate

    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    do {
        print("Performing OCR...")
        try handler.perform([request])
    } catch {
        print("Failed to perform OCR: \(error)")
        return strdup("Error: Failed to perform OCR - \(error.localizedDescription)")
    }

    semaphore.wait()

    return strdup(ocrResult.isEmpty ? "No text found" : ocrResult)
}

// swiftc -emit-library -o screenpipe-vision/lib/libocr.dylib screenpipe-vision/src/ocr.swift
// or
// swiftc -emit-library -o /usr/local/lib/libocr.dylib screenpipe-vision/src/ocr.swift
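The Swift side declares premultipliedLast RGBA with bytesPerRow = width * 4, so the Rust caller must hand over tightly packed RGBA8, which is what image's to_rgba8 produces; for opaque screen captures (alpha = 255), premultiplied and straight alpha coincide, so the difference in alpha semantics is harmless. A quick sketch of the layout invariant worth asserting (an assumption, not asserted anywhere in this diff):

use image::DynamicImage;

fn assert_packed_rgba(image: &DynamicImage) {
    let rgba = image.to_rgba8();
    let (width, height) = rgba.dimensions();
    // 4 bytes per pixel, no per-row padding: matches the
    // bytesPerRow: width * 4 declared in ocr.swift.
    assert_eq!(rgba.as_raw().len(), (width as usize) * (height as usize) * 4);
}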
126 changes: 0 additions & 126 deletions screenpipe-vision/src/utils.rs
@@ -1,6 +1,4 @@
use crate::core::MaxAverageFrame;
use core_foundation::base::FromVoid;
use core_graphics::base::kCGRenderingIntentDefault;
// Assuming core.rs is in the same crate under the `core` module
use image::codecs::png::PngEncoder;
use image::DynamicImage;
@@ -296,127 +294,3 @@ pub async fn perform_ocr_windows(image: &DynamicImage) -> (String, DataOutput, String)

    (text, data_output, json_output)
}

#[cfg(target_os = "macos")]
#[link(name = "Vision", kind = "framework")]
extern "C" {}

use core_foundation::string::CFString;
use core_graphics::color_space::CGColorSpace;
use core_graphics::data_provider::CGDataProvider;
use core_graphics::image::CGImage;
use core_graphics::image::CGImageAlphaInfo;
use objc::runtime::{Class, Object};
use objc::{msg_send, sel, sel_impl};
pub fn perform_ocr_apple(image: &DynamicImage) -> (String, DataOutput, String) {
    // Convert DynamicImage to CGImage
    let rgba_image = image.to_rgba8();
    let (width, height) = rgba_image.dimensions();
    let bytes_per_row = width as usize * 4;
    let data = rgba_image.into_raw();

    let cg_image = {
        let provider = CGDataProvider::from_buffer(Arc::new(data));
        CGImage::new(
            width as usize,
            height as usize,
            8,  // bits per component
            32, // bits per pixel
            bytes_per_row,
            &CGColorSpace::create_device_rgb(),
            CGImageAlphaInfo::CGImageAlphaPremultipliedLast as u32,
            &provider,
            true,
            kCGRenderingIntentDefault,
        )
    };

    unsafe {
        let vision_class =
            Class::get("VNRecognizeTextRequest").expect("VNRecognizeTextRequest class not found");
        let request: *mut Object = msg_send![vision_class, alloc];
        let request: *mut Object = msg_send![request, init];

        println!("VNRecognizeTextRequest created successfully");

        // Set up the request parameters
        let _: () = msg_send![request, setRecognitionLevel:1]; // VNRequestTextRecognitionLevelAccurate
        let _: () = msg_send![request, setUsesLanguageCorrection:true];

        // Create VNImageRequestHandler
        let handler_class =
            Class::get("VNImageRequestHandler").expect("VNImageRequestHandler class not found");
        let handler: *mut Object = msg_send![handler_class, alloc];
        let handler: *mut Object = msg_send![handler, initWithCGImage:cg_image options:std::ptr::null::<std::ffi::c_void>()];

        // Perform the request
        let mut error_ptr: *mut Object = std::ptr::null_mut();
        let success: bool = msg_send![handler, performRequests:&[request] error:&mut error_ptr];

        if !success {
            let error = if !error_ptr.is_null() {
                let description: *const Object = msg_send![error_ptr, localizedDescription];
                let cf_description = CFString::from_void(description as *const _);
                cf_description.to_string()
            } else {
                "Unknown error".to_string()
            };
            return (
                format!("Error performing OCR request: {}", error),
                DataOutput {
                    data: vec![],
                    output: "".to_string(),
                },
                "".to_string(),
            );
        }

        // Extract results
        let results: *const Object = msg_send![request, results];
        if results.is_null() {
            return (
                "Error: No results from OCR request".to_string(),
                DataOutput {
                    data: vec![],
                    output: "".to_string(),
                },
                "".to_string(),
            );
        }

        let count: usize = msg_send![results, count];
        println!("Number of OCR results: {}", count);

        let mut recognized_text = String::new();
        for i in 0..count {
            let observation: *const Object = msg_send![results, objectAtIndex:i];
            if observation.is_null() {
                println!("Warning: Null observation at index {}", i);
                continue;
            }

            let text: *const Object = msg_send![observation, string];
            if text.is_null() {
                println!("Warning: Null text for observation at index {}", i);
                continue;
            }

            let cf_string = CFString::from_void(text as *const _);
            recognized_text.push_str(&cf_string.to_string());
            recognized_text.push('\n');
        }

        let data_output = DataOutput {
            data: vec![],
            output: recognized_text.clone(),
        };

        let json_output = serde_json::json!({
            "text": recognized_text,
            "confidence": 1.0,
        })
        .to_string();

        (recognized_text, data_output, json_output)
    }
}