Added PDF Extractor

sumanth-0 · Oct 26, 2024 · 16a55c6 · 16a55c6
1 parent 90ce57f
commit 16a55c6
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 0 deletions.
diff --git a/pdf-extractor/README.md b/pdf-extractor/README.md
@@ -0,0 +1,18 @@
+# PDF Text Extraction Script
+
+This script extracts text from PDF files using Optical Character Recognition (OCR) with the `pytesseract` library. It converts each page of a PDF document into an image and then extracts the text from those images.
+
+## Features
+
+- Converts each page of a PDF into an image.
+- Uses Tesseract OCR to extract text from images.
+- Outputs the extracted text to the console and saves it to a text file.
+
+## Requirements
+
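+- Python 3.x
+- Libraries:
+  - `pytesseract`
+  - `pdf2image`
+  - `Pillow`
+- System tools (installed separately from the Python libraries):
+  - Tesseract OCR engine (used by `pytesseract`)
+  - Poppler (used by `pdf2image` to render PDF pages)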
+
diff --git a/pdf-extractor/extract_pdf.py b/pdf-extractor/extract_pdf.py
@@ -0,0 +1,34 @@
+import pytesseract
+from pdf2image import convert_from_path
+import os
+
+# Configure the path to the Tesseract executable if it's not in your PATH
+# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
+
+def extract_text_from_pdf(pdf_path):
+    # Convert PDF to images
+    images = convert_from_path(pdf_path)
+
+    # Initialize an empty string to store text
+    extracted_text = ""
+
+    # Loop through each image
+    for i, image in enumerate(images):
+        # Use pytesseract to do OCR on the image
+        text = pytesseract.image_to_string(image)
+        extracted_text += f"--- Page {i + 1} ---\n{text}\n\n"
+
+    return extracted_text
+
+# Path to your PDF file
+pdf_file_path = 'path/to/your/file.pdf'
+
+# Extract text from the PDF
+text = extract_text_from_pdf(pdf_file_path)
+
+# Display the extracted text
+print(text)
+
+# Optionally save to a text file
+with open('extracted_text.txt', 'w', encoding='utf-8') as f:
+    f.write(text)