From 682099f8975a6c55bdb5636bebf14542641216ef Mon Sep 17 00:00:00 2001 From: Saumya Shah Date: Thu, 5 Apr 2018 01:42:25 +0530 Subject: [PATCH 1/2] Probate Parsing Solution 1. OCR using pytesseract - ocr.py 2. Named Entity Recognition using NLTK and Stanford NLP Wrapper a)NLTK - nltk_ner.py b)Stanford NLP - stanford_ner.py 3. To get a good idea of prerequisites and execution details - README.md --- wills/README.md | 28 +++++++++++++++++++++ wills/nltk_ner.py | 57 +++++++++++++++++++++++++++++++++++++++++++ wills/ocr.py | 46 ++++++++++++++++++++++++++++++++++ wills/stanford_ner.py | 51 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 182 insertions(+) create mode 100644 wills/README.md create mode 100644 wills/nltk_ner.py create mode 100644 wills/ocr.py create mode 100644 wills/stanford_ner.py diff --git a/wills/README.md b/wills/README.md new file mode 100644 index 0000000..0ac1cab --- /dev/null +++ b/wills/README.md @@ -0,0 +1,28 @@ +## Prerequisites and how to run the project + +### Prerequisites +To execute the project, following libraries would be required + +- [Tesseract] (https://github.com/tesseract-ocr/tesseract) - You can download it using ```sudo apt-get install tesseract```. To run the tesseract application on Python, another library needs to be installed using ```pip``` called ```pytesseract```. Simply, ```pip3 install pytesseract```. + +- [Pillow] (https://pillow.readthedocs.io/en/5.1.x/) - To download Python's Imaging Library, you can use ```pip3 install pillow``` + +- [Tkinter] (https://docs.python.org/3/library/tkinter.html) - It a Python binder to the Tk GUI. To view the results of the NER which is viewed as a graph you call install python-tkinter using ```sudo apt-get install python3-tkinter```. + +- [Natural Language Toolkit (NLTK)] (http://www.nltk.org/) - To run the Named Entity Recognition, you need to make use of Python's Standard Library for Natural Language Processing. You can install it by simply, ```pip3 install nltk```. 
+ - You would need to download the standard chunkers, taggers for entity recognition namely ```punkt```, ```averaged_perceptron_tagger```, ```maxent_ne_chunker``` and ```words```. For that - run a python shell in your terminal and follow the steps below : + ``` python + import nltk + nltk.download('punkt') + nltk.download('words') + nltk.download('maxent_ne_chunker') + nltk.download('averaged_perceptron_tagger') + ``` +- Stanford NER Library (https://nlp.stanford.edu/software/CRF-NER.shtml#Download) - You can download the 7 class recognizer from the link and move it to the ```/usr/bin``` folder. Please ensure that you have ```java```, ```jre``` and ```jdk``` installed to run this wrapper library. + +### Execute and Run +To run the NLTK Standard NER Library, in the terminal, type ```python3 nltk_ner.py -i ```. + +To run the Stanford NER Tagger, in the terminal type ```python3 stanford_ner.py -i ```. + +#### As an output, all the will entries of the scanned image is retrieved along with their named entities classified. 
diff --git a/wills/nltk_ner.py b/wills/nltk_ner.py new file mode 100644 index 0000000..3f53c1b --- /dev/null +++ b/wills/nltk_ner.py @@ -0,0 +1,57 @@ +from PIL import Image +import pytesseract +import argparse +import cv2 +import os +import csv +import nltk + +ocr = [] + + +def processLanguage(contentArray): + try: + tokenized = nltk.word_tokenize(contentArray) + tagged = nltk.pos_tag(tokenized) + print(tagged) + + namedEnt = nltk.ne_chunk(tagged) + namedEnt.draw() + + except Exception as e: + print(str(e)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-i", "--image", required=True, + help="path to input image to be OCR'd") +args = vars(ap.parse_args()) + +reader = csv.reader(open('data/gold/extracted_wills.csv')) +for row in reader: + filename = args["image"].split('/')[-1].split('.')[0] + if(row[0] == filename): + + # extract the (x,y) rectangular coordinates for each entry in the probate book + x1 = int(row[2]) + y1 = int(row[3]) + x2 = int(row[4]) + y2 = int(row[5]) + + # load the example image and convert it to grayscale and crop + image = cv2.imread(args["image"]) + image = image[y1:y2, x1:x2] + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + # write the grayscale image to disk as a temporary file so we can + # apply OCR to it + filename1 = "{}.png".format(os.getpid()) + cv2.imwrite(filename1, gray) + + # load the image as a PIL/Pillow image, apply OCR, and then delete + # the temporary file + text = pytesseract.image_to_string(Image.open(filename1)) + print(text) + os.remove(filename1) + # apply nltk entity recognizer to the data extracted by ocr + processLanguage(text) diff --git a/wills/ocr.py b/wills/ocr.py new file mode 100644 index 0000000..71b88a5 --- /dev/null +++ b/wills/ocr.py @@ -0,0 +1,46 @@ +from PIL import Image +import pytesseract +import argparse +import cv2 +import os + +# construct the argument parse and parse the arguments +ap = argparse.ArgumentParser() +ap.add_argument("-i", "--image", required=True, + help="path to input image 
to be OCR'd") +ap.add_argument("-p", "--preprocess", type=str, default="thresh", + help="type of preprocessing to be done") +args = vars(ap.parse_args()) + +# load the example image and convert it to grayscale +image = cv2.imread(args["image"]) +gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + +cv2.imshow("Image", gray) + +# check to see if we should apply thresholding to preprocess the +# image +if args["preprocess"] == "thresh": + gray = cv2.threshold(gray, 0, 255, + cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] + +# make a check to see if median blurring should be done to remove +# noise +elif args["preprocess"] == "blur": + gray = cv2.medianBlur(gray, 3) + +# write the grayscale image to disk as a temporary file so we can +# apply OCR to it +filename = "{}.png".format(os.getpid()) +cv2.imwrite(filename, gray) + +# load the image as a PIL/Pillow image, apply OCR, and then delete +# the temporary file +text = pytesseract.image_to_string(Image.open(filename)) +os.remove(filename) +print(text) + +# show the output images +# cv2.imshow("Image", image) +cv2.imshow("Output", gray) +cv2.waitKey(0) diff --git a/wills/stanford_ner.py b/wills/stanford_ner.py new file mode 100644 index 0000000..bff3fc1 --- /dev/null +++ b/wills/stanford_ner.py @@ -0,0 +1,51 @@ +from PIL import Image +import pytesseract +import argparse +import cv2 +import os +import csv +from nltk.tag import StanfordNERTagger +from nltk.tokenize import word_tokenize + +ocr = [] + +ap = argparse.ArgumentParser() +ap.add_argument("-i", "--image", required=True, + help="path to input image to be OCR'd") +args = vars(ap.parse_args()) + +reader = csv.reader(open('data/gold/extracted_wills.csv')) +for row in reader: + filename = args["image"].split('/')[-1].split('.')[0] + if(row[0] == filename): + + # extract the (x,y) rectangular coordinates for each entry in the probate book + x1 = int(row[2]) + y1 = int(row[3]) + x2 = int(row[4]) + y2 = int(row[5]) + + # load the example image and convert it to grayscale and crop 
+ image = cv2.imread(args["image"]) + image = image[y1:y2, x1:x2] + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + # write the grayscale image to disk as a temporary file so we can + # apply OCR to it + filename1 = "{}.png".format(os.getpid()) + cv2.imwrite(filename1, gray) + + # load the image as a PIL/Pillow image, apply OCR, and then delete + # the temporary file + text = pytesseract.image_to_string(Image.open(filename1)) + print(text) + os.remove(filename1) + + # apply stanford entity recognizer to the data extracted by ocr + + st = StanfordNERTagger('/usr/bin/stanford-ner-2018-02-27/classifiers/english.muc.7class.distsim.crf.ser.gz', '/usr/bin/stanford-ner-2018-02-27/stanford-ner.jar', encoding='utf-8') + + tokenized_text = word_tokenize(text) + classified_text = st.tag(tokenized_text) + + print(classified_text) From bd5a0ee471d9fef5e84944d112a4ea5101d76824 Mon Sep 17 00:00:00 2001 From: Saumya Shah Date: Thu, 5 Apr 2018 01:50:20 +0530 Subject: [PATCH 2/2] Fixes in README --- wills/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/wills/README.md b/wills/README.md index 0ac1cab..4d8a868 100644 --- a/wills/README.md +++ b/wills/README.md @@ -1,15 +1,15 @@ ## Prerequisites and how to run the project ### Prerequisites -To execute the project, following libraries would be required +To execute the project, following libraries would be required: -- [Tesseract] (https://github.com/tesseract-ocr/tesseract) - You can download it using ```sudo apt-get install tesseract```. To run the tesseract application on Python, another library needs to be installed using ```pip``` called ```pytesseract```. Simply, ```pip3 install pytesseract```. +- [Tesseract](https://github.com/tesseract-ocr/tesseract) - You can download it using ```sudo apt-get install tesseract-ocr```. To run the tesseract application on Python, another library needs to be installed using ```pip``` called ```pytesseract```. Simply, ```pip3 install pytesseract```. 
-- [Pillow] (https://pillow.readthedocs.io/en/5.1.x/) - To download Python's Imaging Library, you can use ```pip3 install pillow``` +- [Pillow](https://pillow.readthedocs.io/en/5.1.x/) - To download Python's Imaging Library, you can use ```pip3 install pillow``` -- [Tkinter] (https://docs.python.org/3/library/tkinter.html) - It a Python binder to the Tk GUI. To view the results of the NER which is viewed as a graph you call install python-tkinter using ```sudo apt-get install python3-tkinter```. +- [Tkinter](https://docs.python.org/3/library/tkinter.html) - It is a Python binding to the Tk GUI. To view the results of the NER which is viewed as a graph you can install python-tkinter using ```sudo apt-get install python3-tkinter```. -- [Natural Language Toolkit (NLTK)] (http://www.nltk.org/) - To run the Named Entity Recognition, you need to make use of Python's Standard Library for Natural Language Processing. You can install it by simply, ```pip3 install nltk```. +- [Natural Language Toolkit (NLTK)](http://www.nltk.org/) - To run the Named Entity Recognition, you need to make use of Python's Standard Library for Natural Language Processing. You can install it by simply, ```pip3 install nltk```. - You would need to download the standard chunkers, taggers for entity recognition namely ```punkt```, ```averaged_perceptron_tagger```, ```maxent_ne_chunker``` and ```words```. For that - run a python shell in your terminal and follow the steps below : ``` python import nltk @@ -18,11 +18,11 @@ To execute the project, following libraries would be required nltk.download('maxent_ne_chunker') nltk.download('averaged_perceptron_tagger') ``` 
+- [Stanford NER Library](https://nlp.stanford.edu/software/CRF-NER.shtml#Download) - You can download the 7 class recognizer from the link and move it to the ```/usr/bin``` folder. Please ensure that you have ```java```, ```jre``` and ```jdk``` installed to run this wrapper library. ### Execute and Run To run the NLTK Standard NER Library, in the terminal, type ```python3 nltk_ner.py -i ```. To run the Stanford NER Tagger, in the terminal type ```python3 stanford_ner.py -i ```. -#### As an output, all the will entries of the scanned image is retrieved along with their named entities classified. +#### As an output, all the will entries of the scanned image are retrieved along with their named entities classified.