Skip to content

Commit

Permalink
Merge pull request #103 from louis030195/apple-native-ocr
Browse files Browse the repository at this point in the history
feat: apple native ocr
  • Loading branch information
louis030195 committed Aug 8, 2024
2 parents ac64ddf + 35787e7 commit f0ddd56
Show file tree
Hide file tree
Showing 14 changed files with 246 additions and 24 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/release-cli.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,11 @@ jobs:
- name: Create deployment package
run: |
ls -R target/
tar -czf screenpipe-${{ env.VERSION }}-${{ matrix.target }}.tar.gz -C target/${{ matrix.target }}/release screenpipe
mkdir -p screenpipe-${{ env.VERSION }}-${{ matrix.target }}/bin
mkdir -p screenpipe-${{ env.VERSION }}-${{ matrix.target }}/lib
cp target/${{ matrix.target }}/release/screenpipe screenpipe-${{ env.VERSION }}-${{ matrix.target }}/bin/
cp target/${{ matrix.target }}/release/libscreenpipe.dylib screenpipe-${{ env.VERSION }}-${{ matrix.target }}/lib/
tar -czf screenpipe-${{ env.VERSION }}-${{ matrix.target }}.tar.gz -C screenpipe-${{ env.VERSION }}-${{ matrix.target }} .
- name: Calculate SHA256
run: |
Expand Down
3 changes: 2 additions & 1 deletion Formula/screenpipe.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ class Screenpipe < Formula
depends_on "tesseract"

def install
  # Install the CLI binary exactly once: a second `bin.install` of the same
  # path would fail because the first call has already moved the file.
  bin.install "screenpipe"
  # Swift OCR bridge that the binary loads via its
  # `@executable_path/../lib` rpath (set by the `brew` destination in build.rs).
  lib.install "libscreenpipe.dylib"
end

test do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
"ffmpeg/lib/libavutil.59.dylib",
"ffmpeg/lib/libffmpeg.7.dylib",
"ffmpeg/lib/libswresample.5.dylib",
"ffmpeg/lib/libswscale.8.dylib"
"ffmpeg/lib/libswscale.8.dylib",
"../../../../screenpipe-vision/lib/libscreenpipe.dylib"
],
"entitlements": "entitlements.plist",
"signingIdentity": "-",
Expand Down
23 changes: 13 additions & 10 deletions screenpipe-server/src/bin/screenpipe-server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ use crossbeam::queue::SegQueue;
use dirs::home_dir;
use log::{debug, error, info, LevelFilter};
use screenpipe_audio::{
default_input_device, default_output_device, list_audio_devices, parse_audio_device,
DeviceControl,
default_input_device, list_audio_devices, parse_audio_device, DeviceControl,
};
use screenpipe_vision::OcrEngine;
use std::io::Write;
Expand All @@ -34,6 +33,7 @@ enum CliOcrEngine {
Unstructured,
Tesseract,
WindowsNative,
AppleNative,
}

impl From<CliOcrEngine> for CoreOcrEngine {
Expand All @@ -42,6 +42,7 @@ impl From<CliOcrEngine> for CoreOcrEngine {
CliOcrEngine::Unstructured => CoreOcrEngine::Unstructured,
CliOcrEngine::Tesseract => CoreOcrEngine::Tesseract,
CliOcrEngine::WindowsNative => CoreOcrEngine::WindowsNative,
CliOcrEngine::AppleNative => CoreOcrEngine::AppleNative,
}
}
}
Expand Down Expand Up @@ -233,14 +234,16 @@ async fn main() -> anyhow::Result<()> {
}
// audio output only supported on linux atm
// see https://github.com/louis030195/screen-pipe/pull/106
#[cfg(target_os = "linux")]
if let Ok(output_device) = default_output_device() {
audio_devices.push(Arc::new(output_device.clone()));
let device_control = DeviceControl {
is_running: true,
is_paused: false,
};
devices_status.insert(output_device, device_control);
if cfg!(target_os = "linux") {
use screenpipe_audio::default_output_device;
if let Ok(output_device) = default_output_device() {
audio_devices.push(Arc::new(output_device.clone()));
let device_control = DeviceControl {
is_running: true,
is_paused: false,
};
devices_status.insert(output_device, device_control);
}
}
} else {
// Use specified devices
Expand Down
3 changes: 2 additions & 1 deletion screenpipe-server/src/video.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ use chrono::Utc;
use image::ImageFormat::{self};
use log::{debug, error, info, warn};
use screenpipe_core::find_ffmpeg_path;
use screenpipe_vision::{continuous_capture, get_monitor, CaptureResult, OcrEngine};
use screenpipe_vision::core::get_monitor;
use screenpipe_vision::{continuous_capture, CaptureResult, OcrEngine};
use std::collections::VecDeque;
use std::path::PathBuf;
use std::process::Stdio;
Expand Down
9 changes: 9 additions & 0 deletions screenpipe-vision/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,13 @@ assert_cmd = "2.0.14"
predicates = "3.1.0"
assert_fs = "1.1.1"


[build-dependencies]
cc = "1.0"

[package.metadata.osx]
framework = ["Vision", "AppKit"]

[[bin]]
name = "screenpipe-vision"
path = "src/bin/screenpipe-vision.rs"
Expand All @@ -78,3 +85,5 @@ harness = false
[target.'cfg(target_os = "windows")'.dependencies]
windows = { version = "0.58", features = ["Graphics_Imaging", "Media_Ocr", "Storage", "Storage_Streams"] }

[target.'cfg(target_os = "macos")'.dependencies]
libc = "0.2"
19 changes: 19 additions & 0 deletions screenpipe-vision/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use std::env;

/// Build script: emits the linker directives needed to load the Swift OCR
/// bridge (`libscreenpipe.dylib`) when building for macOS, and does nothing
/// for every other target.
///
/// The `DESTINATION` environment variable selects where the dylib will live
/// relative to the final executable:
/// * `brew`  -> `../lib`
/// * `tauri` -> `../Frameworks`
/// * anything else (including unset) -> the in-repo `screenpipe-vision/lib`
///
/// NOTE(review): `cargo:rustc-link-arg` only affects targets of this package;
/// confirm that dependent binaries get their rpath some other way.
fn main() {
    // Without this, changing DESTINATION would not re-run the script and a
    // stale rpath would be baked into the binary.
    println!("cargo:rerun-if-env-changed=DESTINATION");

    // A `#[cfg(target_os = ...)]` attribute inside a build script describes
    // the *host* the script runs on. Cargo exposes the real compilation
    // target through CARGO_CFG_TARGET_OS, which is what matters when
    // cross-compiling.
    let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
    if target_os != "macos" {
        return;
    }

    let destination = env::var("DESTINATION").unwrap_or_default();
    let rpath = match destination.as_str() {
        "brew" => "@executable_path/../lib",
        "tauri" => "@executable_path/../Frameworks",
        _ => "@executable_path/../../screenpipe-vision/lib",
    };

    println!("cargo:rustc-link-arg=-Wl,-rpath,{}", rpath);
    println!("cargo:rustc-link-lib=dylib=screenpipe");
}
31 changes: 31 additions & 0 deletions screenpipe-vision/src/apple.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
use image::DynamicImage;
use std::ffi::CStr;
use std::os::raw::{c_char, c_uchar};

// FFI binding to the `@_cdecl("perform_ocr")` function exported by
// `ocr.swift`. The Swift signature takes `Int` for `length`, `width` and
// `height`; Swift's `Int` is pointer-sized, so the dimensions are declared as
// `isize` here (declaring them as `i32` would leave the upper half of each
// argument register unspecified on 64-bit targets).
#[link(name = "screenpipe")]
extern "C" {
    fn perform_ocr(
        image_data: *const c_uchar,
        length: usize,
        width: isize,
        height: isize,
    ) -> *mut c_char;
}

/// Runs text recognition on `image` through the Swift bridge and returns the
/// recognised text.
///
/// The image is converted to tightly packed RGBA8 before being handed over.
/// The bridge reports its own failures as strings starting with `"Error: "`
/// and yields `"No text found"` when nothing was recognised; those strings are
/// returned unchanged. An empty string is returned if the bridge hands back a
/// null pointer.
pub fn perform_ocr_apple(image: &DynamicImage) -> String {
    let rgba = image.to_rgba8();
    let (width, height) = rgba.dimensions();
    let raw_data = rgba.as_raw();

    // SAFETY: `raw_data` stays alive for the whole call and `length` is its
    // exact size in bytes. The returned pointer is checked for null before it
    // is read; when non-null it is a NUL-terminated string allocated with
    // `strdup` on the Swift side, so it is read once and released with `free`.
    unsafe {
        let result_ptr = perform_ocr(
            raw_data.as_ptr(),
            raw_data.len(),
            // u32 -> isize is a lossless widening on the 64-bit macOS targets
            // this module is built for.
            width as isize,
            height as isize,
        );
        // The Swift function's return type is optional, i.e. it may be null.
        if result_ptr.is_null() {
            return String::new();
        }
        let result = CStr::from_ptr(result_ptr).to_string_lossy().into_owned();
        libc::free(result_ptr as *mut libc::c_void);
        result
    }
}
2 changes: 1 addition & 1 deletion screenpipe-vision/src/bin/screenpipe-vision.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use clap::Parser;
use screenpipe_vision::{continuous_capture, get_monitor, OcrEngine};
use screenpipe_vision::{continuous_capture, core::get_monitor, OcrEngine};
use std::{sync::Arc, time::Duration};
use tokio::sync::mpsc::channel;

Expand Down
38 changes: 32 additions & 6 deletions screenpipe-vision/src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,21 @@ use strsim::levenshtein;
use tokio::sync::{mpsc::Sender, Mutex}; // Corrected import for Mutex
use xcap::{Monitor, Window};

#[cfg(target_os = "macos")]
use crate::apple::perform_ocr_apple;
#[cfg(target_os = "windows")]
use crate::utils::perform_ocr_windows;
use crate::utils::OcrEngine;
use crate::utils::{
capture_screenshot, compare_with_previous_image, perform_ocr_tesseract,
save_text_files,
capture_screenshot, compare_with_previous_image, perform_ocr_tesseract, save_text_files,
};
use rusty_tesseract::{Data, DataOutput}; // Add this import
use rusty_tesseract::{Data, DataOutput};
use screenpipe_integrations::unstructured_ocr::perform_ocr_cloud;
/// Control commands exposed by the vision crate.
///
/// NOTE(review): no consumer of these messages is visible in this hunk; the
/// variant descriptions below are inferred from their names — confirm against
/// the receiving side.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ControlMessage {
    /// Presumably suspends processing until a `Resume` arrives.
    Pause,
    /// Presumably continues processing after a `Pause`.
    Resume,
    /// Presumably terminates processing.
    Stop,
}

pub struct DataOutputWrapper {
pub data_output: rusty_tesseract::tesseract::output_data::DataOutput,
Expand Down Expand Up @@ -135,7 +141,10 @@ pub async fn continuous_capture(

// Skip the frame if the current average difference is less than 0.006
if current_average < 0.006 {
debug!("Skipping frame {} due to low average difference: {:.3}", frame_counter, current_average);
debug!(
"Skipping frame {} due to low average difference: {:.3}",
frame_counter, current_average
);
frame_counter += 1;
tokio::time::sleep(interval).await;
continue;
Expand Down Expand Up @@ -194,7 +203,7 @@ pub async fn continuous_capture(
});

frame_counter = 0; // Reset frame_counter after OCR task is processed
// Reset max_average and max_avg_value after spawning the OCR task
// Reset max_average and max_avg_value after spawning the OCR task
max_avg_value = 0.0;
}
}
Expand Down Expand Up @@ -244,6 +253,23 @@ pub async fn process_ocr_task(
debug!("Windows Native OCR");
perform_ocr_windows(&image_arc).await
}
#[cfg(target_os = "macos")]
OcrEngine::AppleNative => {
debug!("Apple Native OCR");
let text = perform_ocr_apple(&image_arc);
(
text.clone(),
DataOutput {
output: String::new(),
data: vec![],
},
serde_json::json!([{
"text": text,
"confidence": "1.0",
}])
.to_string(),
)
}
_ => {
error!("Unsupported OCR engine");
return Err(std::io::Error::new(
Expand Down Expand Up @@ -324,4 +350,4 @@ pub async fn process_ocr_task(
frame_number, _duration
);
Ok(())
}
}
5 changes: 4 additions & 1 deletion screenpipe-vision/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
// The Apple OCR bridge links against `libscreenpipe` and uses `libc`, both of
// which are only available for macOS builds, so the module must not be
// compiled on other platforms.
#[cfg(target_os = "macos")]
pub mod apple;
pub mod core;
pub mod utils;
pub use core::{continuous_capture, get_monitor, process_ocr_task, CaptureResult};
#[cfg(target_os = "macos")]
pub use apple::perform_ocr_apple;
pub use core::{continuous_capture, process_ocr_task, CaptureResult, ControlMessage};
pub use utils::{perform_ocr_tesseract, OcrEngine};
87 changes: 87 additions & 0 deletions screenpipe-vision/src/ocr.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import CoreGraphics
import Foundation
import Vision

/// C-callable entry point (`perform_ocr`) that runs Vision text recognition on
/// a raw pixel buffer.
///
/// - Parameters:
///   - imageData: Pointer to the pixel bytes; the buffer is interpreted as
///     8 bits per component, 32 bits per pixel, `width * 4` bytes per row.
///   - length: Size of the buffer in bytes.
///   - width: Image width in pixels.
///   - height: Image height in pixels.
/// - Returns: A `strdup`-allocated C string that the caller must release with
///   `free`. It holds the recognised lines (one per line), `"No text found"`
///   when nothing was recognised, or a message starting with `"Error: "` on
///   failure.
@_cdecl("perform_ocr")
public func performOCR(imageData: UnsafePointer<UInt8>, length: Int, width: Int, height: Int)
  -> UnsafeMutablePointer<CChar>? {

  // Reject non-positive dimensions and buffers too small for
  // width * height * 4 bytes before CoreGraphics reads from them. The check is
  // written with divisions so that huge dimensions cannot overflow `Int`.
  guard width > 0, height > 0, length >= 0, (length / 4) / width >= height else {
    return strdup("Error: Invalid image dimensions or buffer length")
  }

  guard let dataProvider = CGDataProvider(data: Data(bytes: imageData, count: length) as CFData)
  else {
    return strdup("Error: Failed to create CGDataProvider")
  }

  guard
    let cgImage = CGImage(
      width: width,
      height: height,
      bitsPerComponent: 8,
      bitsPerPixel: 32,
      bytesPerRow: width * 4,
      space: CGColorSpaceCreateDeviceRGB(),
      bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.premultipliedLast.rawValue),
      provider: dataProvider,
      decode: nil,
      shouldInterpolate: false,
      intent: .defaultIntent
    )
  else {
    return strdup("Error: Failed to create CGImage")
  }

  // The completion handler fills `ocrResult` and signals the semaphore so the
  // result is only read after the handler has run.
  let semaphore = DispatchSemaphore(value: 0)
  var ocrResult = ""

  let request = VNRecognizeTextRequest { request, error in
    defer { semaphore.signal() }

    if let error = error {
      ocrResult = "Error: \(error.localizedDescription)"
      return
    }

    guard let observations = request.results as? [VNRecognizedTextObservation] else {
      ocrResult = "Error: Failed to process image or no text found"
      return
    }

    // Keep only the best candidate of each observation, one per output line;
    // observations without any candidate are skipped.
    for observation in observations {
      guard let topCandidate = observation.topCandidates(1).first else {
        continue
      }
      ocrResult += "\(topCandidate.string)\n"
    }
  }

  request.recognitionLevel = .accurate

  let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
  do {
    try handler.perform([request])
  } catch {
    return strdup("Error: Failed to perform OCR - \(error.localizedDescription)")
  }

  semaphore.wait()

  return strdup(ocrResult.isEmpty ? "No text found" : ocrResult)
}

// swiftc -emit-library -o screenpipe-vision/lib/libscreenpipe.dylib screenpipe-vision/src/ocr.swift

// or
// swiftc -emit-library -o /usr/local/lib/libscreenpipe.dylib screenpipe-vision/src/ocr.swift
3 changes: 2 additions & 1 deletion screenpipe-vision/src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::core::MaxAverageFrame; // Assuming core.rs is in the same crate under the `core` module
use crate::core::MaxAverageFrame;
use image::DynamicImage;
use image_compare::{Algorithm, Metric, Similarity}; // Added import for Similarity
use log::{debug, error};
Expand All @@ -18,6 +18,7 @@ pub enum OcrEngine {
Unstructured,
Tesseract,
WindowsNative,
AppleNative,
}

impl Default for OcrEngine {
Expand Down
37 changes: 37 additions & 0 deletions screenpipe-vision/tests/apple_vision_test.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#[cfg(target_os = "macos")]
#[cfg(test)]
mod tests {
    use image::GenericImageView;
    use screenpipe_vision::perform_ocr_apple;
    use std::path::PathBuf;

    /// Runs the Apple OCR bridge on the bundled `tests/testing_OCR.png`
    /// fixture and checks that a known snippet of its text is recognised.
    #[tokio::test]
    async fn test_apple_native_ocr() {
        // Locate the fixture relative to this crate's manifest directory.
        let fixture: PathBuf = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("tests")
            .join("testing_OCR.png");
        println!("Path to testing_OCR.png: {:?}", fixture);

        // Diagnostic only: report the file size when the metadata is readable.
        if let Ok(size) = std::fs::metadata(&fixture).map(|meta| meta.len()) {
            println!("File size: {} bytes", size);
        }

        // Decode the fixture; a missing or corrupt file fails the test here.
        let decoded = image::open(&fixture).expect("Failed to open image");
        println!("Image dimensions: {:?}", decoded.dimensions());

        // Diagnostic only: the RGB copy is printed, not passed to the OCR call.
        let rgb_copy = decoded.to_rgb8();
        println!("RGB image dimensions: {:?}", rgb_copy.dimensions());

        let recognised = perform_ocr_apple(&decoded);
        println!("OCR text: {:?}", recognised);

        assert!(
            recognised.contains("ocr_tx.receiver_count"),
            "OCR failed: {:?}",
            recognised
        );
    }
}

0 comments on commit f0ddd56

Please sign in to comment.