From 682099f8975a6c55bdb5636bebf14542641216ef Mon Sep 17 00:00:00 2001 From: Saumya Shah Date: Thu, 5 Apr 2018 01:42:25 +0530 Subject: [PATCH 1/2] Probate Parsing Solution 1. OCR using pytesseract - ocr.py 2. Named Entity Recognition using NLTK and Stanford NLP Wrapper a)NLTK - nltk_ner.py b)Stanford NLP - stanford_ner.py 3. To get a good idea of prerequisites and execution details - README.md --- wills/README.md | 28 +++++++++++++++++++++ wills/nltk_ner.py | 57 +++++++++++++++++++++++++++++++++++++++++++ wills/ocr.py | 46 ++++++++++++++++++++++++++++++++++ wills/stanford_ner.py | 51 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 182 insertions(+) create mode 100644 wills/README.md create mode 100644 wills/nltk_ner.py create mode 100644 wills/ocr.py create mode 100644 wills/stanford_ner.py diff --git a/wills/README.md b/wills/README.md new file mode 100644 index 0000000..0ac1cab --- /dev/null +++ b/wills/README.md @@ -0,0 +1,28 @@ +## Prerequisites and how to run the project + +### Prerequisites +To execute the project, following libraries would be required + +- [Tesseract] (https://github.com/tesseract-ocr/tesseract) - You can download it using ```sudo apt-get install tesseract```. To run the tesseract application on Python, another library needs to be installed using ```pip``` called ```pytesseract```. Simply, ```pip3 install pytesseract```. + +- [Pillow] (https://pillow.readthedocs.io/en/5.1.x/) - To download Python's Imaging Library, you can use ```pip3 install pillow``` + +- [Tkinter] (https://docs.python.org/3/library/tkinter.html) - It a Python binder to the Tk GUI. To view the results of the NER which is viewed as a graph you call install python-tkinter using ```sudo apt-get install python3-tkinter```. + +- [Natural Language Toolkit (NLTK)] (http://www.nltk.org/) - To run the Named Entity Recognition, you need to make use of Python's Standard Library for Natural Language Processing. You can install it by simply, ```pip3 install nltk```. 
+ - You would need to download the standard chunkers, taggers for entity recognition namely ```punkt```, ```averaged_perceptron_tagger```, ```maxent_ne_chunker``` and ```words```. For that - run a python shell in your terminal and follow the steps below : + ``` python + import nltk + nltk.download('punkt') + nltk.download('words') + nltk.download('maxent_ne_chunker') + nltk.download('averaged_perceptron_tagger') + ``` +- Stanford NER Library (https://nlp.stanford.edu/software/CRF-NER.shtml#Download) - You can download the 7 class recognizer from the link and move it to the ```/usr/bin``` folder. Please ensure that you have ```java```, ```jre``` and ```jdk``` installed to run this wrapper library. + +### Execute and Run +To run the NLTK Standard NER Library, in the terminal, type ```python3 nltk_ner.py -i ```. + +To run the Stanford NER Tagger, in the terminal type ```python3 stanford_ner.py -i ```. + +#### As an output, all the will entries of the scanned image is retrieved along with their named entities classified. 
diff --git a/wills/nltk_ner.py b/wills/nltk_ner.py new file mode 100644 index 0000000..3f53c1b --- /dev/null +++ b/wills/nltk_ner.py @@ -0,0 +1,57 @@ +from PIL import Image +import pytesseract +import argparse +import cv2 +import os +import csv +import nltk + +ocr = [] + + +def processLanguage(contentArray): + try: + tokenized = nltk.word_tokenize(contentArray) + tagged = nltk.pos_tag(tokenized) + print(tagged) + + namedEnt = nltk.ne_chunk(tagged) + namedEnt.draw() + + except Exception as e: + print(str(e)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-i", "--image", required=True, + help="path to input image to be OCR'd") +args = vars(ap.parse_args()) + +reader = csv.reader(open('data/gold/extracted_wills.csv')) +for row in reader: + filename = args["image"].split('/')[-1].split('.')[0] + if(row[0] == filename): + + # extract the (x,y) rectangular coordinates for each entry in the probate book + x1 = int(row[2]) + y1 = int(row[3]) + x2 = int(row[4]) + y2 = int(row[5]) + + # load the example image and convert it to grayscale and crop + image = cv2.imread(args["image"]) + image = image[y1:y2, x1:x2] + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + # write the grayscale image to disk as a temporary file so we can + # apply OCR to it + filename1 = "{}.png".format(os.getpid()) + cv2.imwrite(filename1, gray) + + # load the image as a PIL/Pillow image, apply OCR, and then delete + # the temporary file + text = pytesseract.image_to_string(Image.open(filename1)) + print(text) + os.remove(filename1) + # apply nltk entity recognizer to the data extracted by ocr + processLanguage(text) diff --git a/wills/ocr.py b/wills/ocr.py new file mode 100644 index 0000000..71b88a5 --- /dev/null +++ b/wills/ocr.py @@ -0,0 +1,46 @@ +from PIL import Image +import pytesseract +import argparse +import cv2 +import os + +# construct the argument parse and parse the arguments +ap = argparse.ArgumentParser() +ap.add_argument("-i", "--image", required=True, + help="path to input image 
to be OCR'd") +ap.add_argument("-p", "--preprocess", type=str, default="thresh", + help="type of preprocessing to be done") +args = vars(ap.parse_args()) + +# load the example image and convert it to grayscale +image = cv2.imread(args["image"]) +gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + +cv2.imshow("Image", gray) + +# check to see if we should apply thresholding to preprocess the +# image +if args["preprocess"] == "thresh": + gray = cv2.threshold(gray, 0, 255, + cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] + +# make a check to see if median blurring should be done to remove +# noise +elif args["preprocess"] == "blur": + gray = cv2.medianBlur(gray, 3) + +# write the grayscale image to disk as a temporary file so we can +# apply OCR to it +filename = "{}.png".format(os.getpid()) +cv2.imwrite(filename, gray) + +# load the image as a PIL/Pillow image, apply OCR, and then delete +# the temporary file +text = pytesseract.image_to_string(Image.open(filename)) +os.remove(filename) +print(text) + +# show the output images +# cv2.imshow("Image", image) +cv2.imshow("Output", gray) +cv2.waitKey(0) diff --git a/wills/stanford_ner.py b/wills/stanford_ner.py new file mode 100644 index 0000000..bff3fc1 --- /dev/null +++ b/wills/stanford_ner.py @@ -0,0 +1,51 @@ +from PIL import Image +import pytesseract +import argparse +import cv2 +import os +import csv +from nltk.tag import StanfordNERTagger +from nltk.tokenize import word_tokenize + +ocr = [] + +ap = argparse.ArgumentParser() +ap.add_argument("-i", "--image", required=True, + help="path to input image to be OCR'd") +args = vars(ap.parse_args()) + +reader = csv.reader(open('data/gold/extracted_wills.csv')) +for row in reader: + filename = args["image"].split('/')[-1].split('.')[0] + if(row[0] == filename): + + # extract the (x,y) rectangular coordinates for each entry in the probate book + x1 = int(row[2]) + y1 = int(row[3]) + x2 = int(row[4]) + y2 = int(row[5]) + + # load the example image and convert it to grayscale and crop 
+ image = cv2.imread(args["image"]) + image = image[y1:y2, x1:x2] + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + # write the grayscale image to disk as a temporary file so we can + # apply OCR to it + filename1 = "{}.png".format(os.getpid()) + cv2.imwrite(filename1, gray) + + # load the image as a PIL/Pillow image, apply OCR, and then delete + # the temporary file + text = pytesseract.image_to_string(Image.open(filename1)) + print(text) + os.remove(filename1) + + # apply stanford entity recognizer to the data extracted by ocr + + st = StanfordNERTagger('/usr/bin/stanford-ner-2018-02-27/classifiers/english.muc.7class.distsim.crf.ser.gz', '/usr/bin/stanford-ner-2018-02-27/stanford-ner.jar', encoding='utf-8') + + tokenized_text = word_tokenize(text) + classified_text = st.tag(tokenized_text) + + print(classified_text) From bd5a0ee471d9fef5e84944d112a4ea5101d76824 Mon Sep 17 00:00:00 2001 From: Saumya Shah Date: Thu, 5 Apr 2018 01:50:20 +0530 Subject: [PATCH 2/2] Fixes in README --- wills/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/wills/README.md b/wills/README.md index 0ac1cab..4d8a868 100644 --- a/wills/README.md +++ b/wills/README.md @@ -1,15 +1,15 @@ ## Prerequisites and how to run the project ### Prerequisites -To execute the project, following libraries would be required +To execute the project, following libraries would be required: -- [Tesseract] (https://github.com/tesseract-ocr/tesseract) - You can download it using ```sudo apt-get install tesseract```. To run the tesseract application on Python, another library needs to be installed using ```pip``` called ```pytesseract```. Simply, ```pip3 install pytesseract```. +- [Tesseract](https://github.com/tesseract-ocr/tesseract) - You can download it using ```sudo apt-get install tesseract-ocr```. To run the tesseract application on Python, another library needs to be installed using ```pip``` called ```pytesseract```. Simply, ```pip3 install pytesseract```. 
-- [Pillow] (https://pillow.readthedocs.io/en/5.1.x/) - To download Python's Imaging Library, you can use ```pip3 install pillow``` +- [Pillow](https://pillow.readthedocs.io/en/5.1.x/) - To download Python's Imaging Library, you can use ```pip3 install pillow``` -- [Tkinter] (https://docs.python.org/3/library/tkinter.html) - It a Python binder to the Tk GUI. To view the results of the NER which is viewed as a graph you call install python-tkinter using ```sudo apt-get install python3-tkinter```. +- [Tkinter](https://docs.python.org/3/library/tkinter.html) - It is a Python binding to the Tk GUI. To view the results of the NER which is viewed as a graph you can install python-tkinter using ```sudo apt-get install python3-tkinter```. -- [Natural Language Toolkit (NLTK)] (http://www.nltk.org/) - To run the Named Entity Recognition, you need to make use of Python's Standard Library for Natural Language Processing. You can install it by simply, ```pip3 install nltk```. +- [Natural Language Toolkit (NLTK)](http://www.nltk.org/) - To run the Named Entity Recognition, you need to make use of Python's Standard Library for Natural Language Processing. You can install it by simply, ```pip3 install nltk```. - You would need to download the standard chunkers, taggers for entity recognition namely ```punkt```, ```averaged_perceptron_tagger```, ```maxent_ne_chunker``` and ```words```. For that - run a python shell in your terminal and follow the steps below : ``` python import nltk @@ -18,11 +18,11 @@ To execute the project, following libraries would be required nltk.download('maxent_ne_chunker') nltk.download('averaged_perceptron_tagger') ``` 
+- [Stanford NER Library](https://nlp.stanford.edu/software/CRF-NER.shtml#Download) - You can download the 7 class recognizer from the link and move it to the ```/usr/bin``` folder. Please ensure that you have ```java```, ```jre``` and ```jdk``` installed to run this wrapper library. ### Execute and Run To run the NLTK Standard NER Library, in the terminal, type ```python3 nltk_ner.py -i ```. To run the Stanford NER Tagger, in the terminal type ```python3 stanford_ner.py -i ```. -#### As an output, all the will entries of the scanned image is retrieved along with their named entities classified. +#### As an output, all the will entries of the scanned image are retrieved along with their named entities classified.