Merge pull request #1 from g4-api/development

OCR improvements
g4-api · Jun 3, 2024 · 84bdc0a · 84bdc0a
2 parents 4fb72d5 + 00a94af
commit 84bdc0a
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 16 deletions.
diff --git a/src/OcrInspector/MainWindow.xaml b/src/OcrInspector/MainWindow.xaml
@@ -40,10 +40,19 @@
                 HorizontalAlignment="Left"
                 VerticalAlignment="Top"
                 Width="100"
-                Margin="10"
+                Margin="10,10,0,0"
                 Click="BtnLoadImage_Click"
                 Grid.Row="1"/>
 
+        <Button Name="BtnTakeScreenshot"
+                Content="Take Screenshot..."
+                HorizontalAlignment="Center"
+                VerticalAlignment="Top"
+                Width="100"
+                Margin="10,10,0,0"
+                Click="BtnTakeScreenshot_Click"
+                Grid.Row="1"/>
+
         <ScrollViewer Name="MainScrollViewer"
                       HorizontalAlignment="Stretch"
                       VerticalAlignment="Stretch"

diff --git a/src/OcrInspector/MainWindow.xaml.cs b/src/OcrInspector/MainWindow.xaml.cs
@@ -1,17 +1,21 @@
-using Emgu.CV;
-using Emgu.CV.OCR;
-using Emgu.CV.Structure;
-
-using Microsoft.Win32;
-
-using System;
+using System;
 using System.Collections.Generic;
+using System.Drawing;
 using System.Linq;
+using System.Runtime.InteropServices;
+using System.Threading;
 using System.Windows;
 using System.Windows.Controls;
 using System.Windows.Media;
 using System.Windows.Media.Imaging;
-using System.Windows.Shapes;
+
+using Emgu.CV;
+using Emgu.CV.OCR;
+using Emgu.CV.Structure;
+
+using Microsoft.Win32;
+
+
 
 namespace OcrInspector
 {
@@ -83,18 +87,71 @@ private void BtnLoadImage_Click(object sender, RoutedEventArgs e)
             MainCanvas.Height = bitmap.PixelHeight;
         }
 
+        // Handles the Take Screenshot button click: captures the main display, runs Tesseract OCR on the
+        // capture, and shows the resulting image together with one clickable region per recognized word.
+        //
+        // sender: the control that raised the event (not used).
+        // e:      the routed event data (not used).
+        private void BtnTakeScreenshot_Click(object sender, RoutedEventArgs e)
+        {
+            // Query the current display mode before touching the window, so that a failure here
+            // leaves the UI exactly as it was. A null device name selects the current display device.
+            const int ENUM_CURRENT_SETTINGS = -1;
+            DevMode devMode = default;
+            devMode.dmSize = (short)Marshal.SizeOf(devMode);
+            if (!EnumDisplaySettings(null, ENUM_CURRENT_SETTINGS, ref devMode))
+            {
+                // Without valid dimensions the bitmap below cannot be created; report and bail out.
+                System.Windows.MessageBox.Show("Unable to read the display settings of the main monitor.");
+                return;
+            }
+
+            // Bitmap wraps a GDI+ handle; the using declaration releases it when the handler returns.
+            using var bitmap = new Bitmap(devMode.dmPelsWidth,
+                                          devMode.dmPelsHeight,
+                                          System.Drawing.Imaging.PixelFormat.Format32bppArgb);
+
+            // Prevent the app window from being seen in the screenshot. The finally block guarantees
+            // the window is shown again even if the capture throws.
+            var appWindow = Application.Current.MainWindow;
+            appWindow.Hide();
+            try
+            {
+                // Give the desktop time to repaint without the window before capturing.
+                Thread.Sleep(500);
+
+                // Graphics also wraps a GDI+ handle and is released as soon as the copy is done.
+                using var gfxScreenshot = Graphics.FromImage(bitmap);
+                gfxScreenshot.CopyFromScreen(devMode.dmPositionX, devMode.dmPositionY, 0, 0, bitmap.Size);
+            }
+            finally
+            {
+                appWindow.Show();
+            }
+
+            // Increase DPI for better accuracy.
+            bitmap.SetResolution(300, 300);
+
+            // Run OCR on the capture; the result carries the image to display and the recognized words.
+            var resolvedImage = ResolveWords(bitmap);
+
+            // Clear the existing clickable points on the MainCanvas before adding the new ones.
+            // NOTE(review): this keeps the first and the last child and throws when the canvas has
+            // fewer than two children - confirm against the canvas layout in MainWindow.xaml.
+            MainCanvas.Children.RemoveRange(1, MainCanvas.Children.Count - 2);
+
+            // Add clickable points to the MainCanvas at the locations of the recognized words
+            foreach (var word in resolvedImage.Words)
+            {
+                AddClickablePoint(word);
+            }
+
+            // Display the processed image in the window
+            MainImage.Source = resolvedImage.ImageSource;
+
+            // Size the image and the canvas to the capture's pixel dimensions so the
+            // word regions line up with the displayed image
+            MainImage.Width = bitmap.Width;
+            MainImage.Height = bitmap.Height;
+            MainCanvas.Width = bitmap.Width;
+            MainCanvas.Height = bitmap.Height;
+        }
+
+
         // Processes the image at the given path using Tesseract OCR, converts it to grayscale, recognizes the text,
         // draws bounding boxes around the recognized words, and returns the processed image along with the list of recognized words.
         private static (ImageSource ImageSource, List<Tesseract.Word> Words) ResolveWords(string imagePath)
         {
             // Load the image
-            var image = new Image<Bgr, byte>(imagePath);
+            using var image = new Image<Bgr, byte>(imagePath);
 
             // Initialize Tesseract OCR
-            var ocr = new Tesseract("TrainData/", "eng", OcrEngineMode.TesseractOnly);
-
+            using var ocr = new Tesseract("TrainData/", "eng", OcrEngineMode.Default);
+            ocr.PageSegMode = Emgu.CV.OCR.PageSegMode.SparseText;
             // Convert to grayscale for OCR processing (Tesseract requires a grayscale image)
-            var grayImage = image.Convert<Gray, byte>();
+            using var grayImage = image.Convert<Gray, byte>();
 
             // Perform OCR on the grayscale image and get the recognized words
             ocr.SetImage(grayImage);
@@ -107,6 +164,29 @@ private static (ImageSource ImageSource, List<Tesseract.Word> Words) ResolveWord
             return (image.ToBitmapSource(), words);
         }
 
+        // Runs Tesseract OCR over an in-memory Bitmap.
+        // The bitmap is converted to a BGR image and then to grayscale; recognition runs on the grayscale copy
+        // using the "eng" data from "TrainData/" with sparse-text page segmentation.
+        // Returns the grayscale image as an ImageSource together with the list of recognized words.
+        private static (ImageSource ImageSource, List<Tesseract.Word> Words) ResolveWords(Bitmap bitmap)
+        {
+            // BGR copy of the bitmap, used as the source for the grayscale conversion
+            using var colorImage = bitmap.ToImage<Bgr, byte>();
+
+            // OCR engine in its default engine mode
+            using var engine = new Tesseract("TrainData/", "eng", OcrEngineMode.Default);
+            engine.PageSegMode = Emgu.CV.OCR.PageSegMode.SparseText;
+
+            // Single-channel version of the image that is handed to the recognizer
+            using var grayscale = colorImage.Convert<Gray, byte>();
+            engine.SetImage(grayscale);
+            engine.Recognize();
+
+            // Copy the recognized words into a list before the engine is disposed
+            List<Tesseract.Word> recognizedWords = engine.GetWords().ToList();
+
+            // The grayscale image (not the color one) is what gets displayed
+            ImageSource displaySource = grayscale.ToBitmapSource();
+            return (displaySource, recognizedWords);
+        }
+
+
         // Adds a clickable point to the MainCanvas at the location of the given word.
         // The point displays a tooltip with the word text and accuracy.
         private void AddClickablePoint(Tesseract.Word word)
@@ -126,11 +206,11 @@ private void AddClickablePoint(Tesseract.Word word)
             };
 
             // Create a rectangle to represent the clickable point on the image
-            Rectangle rectangle = new()
+            System.Windows.Shapes.Rectangle rectangle = new()
             {
-                Fill = Brushes.Transparent,
+                Fill = System.Windows.Media.Brushes.Transparent,
                 Height = word.Region.Height,
-                Stroke = Brushes.Blue,
+                Stroke = System.Windows.Media.Brushes.Blue,
                 StrokeThickness = 1,
                 Tag = word.Region.Location,
                 ToolTip = tooltip,
@@ -155,5 +235,49 @@ private void AddClickablePoint(Tesseract.Word word)
             // Add the rectangle to the MainCanvas to display the clickable point on the image
             MainCanvas.Children.Add(rectangle);
         }
+
+        #region ExternalMethods
+        [DllImport("user32.dll")] // P/Invoke into user32.dll; no CharSet is specified here or on DevMode
+        static extern bool EnumDisplaySettings(string deviceName, int modeNum, ref DevMode devMode); // fills devMode; called from BtnTakeScreenshot_Click with a null device name and mode -1
+
+        [StructLayout(LayoutKind.Sequential)] // fields are marshalled in declaration order - do not reorder, insert or remove fields
+        struct DevMode // presumably mirrors the Win32 DEVMODE structure - confirm against the Win32 docs before changing
+        {
+            [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 0x20)] // fixed-size inline string of 0x20 (32) characters
+            public string dmDeviceName;
+            public short dmSpecVersion;
+            public short dmDriverVersion;
+            public short dmSize; // set by the caller to Marshal.SizeOf of this struct before the call
+            public short dmDriverExtra;
+            public int dmFields;
+            public int dmPositionX; // used as the X origin of the screen capture
+            public int dmPositionY; // used as the Y origin of the screen capture
+            public int dmDisplayOrientation;
+            public int dmDisplayFixedOutput;
+            public short dmColor;
+            public short dmDuplex;
+            public short dmYResolution;
+            public short dmTTOption;
+            public short dmCollate;
+            [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 0x20)] // fixed-size inline string of 0x20 (32) characters
+            public string dmFormName;
+            public short dmLogPixels;
+            public int dmBitsPerPel;
+            public int dmPelsWidth; // used as the width of the screenshot bitmap
+            public int dmPelsHeight; // used as the height of the screenshot bitmap
+            public int dmDisplayFlags;
+            public int dmDisplayFrequency;
+            public int dmICMMethod;
+            public int dmICMIntent;
+            public int dmMediaType;
+            public int dmDitherType;
+            public int dmReserved1;
+            public int dmReserved2;
+            public int dmPanningWidth;
+            public int dmPanningHeight;
+        }
+        #endregion
     }
+
+
 }
diff --git a/src/OcrInspector/TrainData/eng.traineddata b/src/OcrInspector/TrainData/eng.traineddata