"""Convert a PDF file to plain text.

The text of every page is printed to stdout and, when an output path is
given, also written to that file as UTF-8. Each page's text is followed by
a "Page N" footer and a rule of 80 asterisks.

Requirements:
    PyPDF2 (https://pypi.org/project/PyPDF2/)

Usage:
    python3 pdf_to_text.py -p <pdf_path> [-o <output_txt>]

Example:
    python3 pdf_to_text.py -p /home/username/Documents/sample.pdf -o /home/username/Documents/sample.txt
"""
import argparse

# Separator line written after every page footer.
PAGE_RULE = '*' * 80


def build_parser():
    """Return the command-line parser for the converter.

    Options:
        -p / --path    (required) path of the PDF to convert.
        -o / --output  (optional) text file to write; when omitted the text
                       is only printed.
    """
    parser = argparse.ArgumentParser(
        description=' A program to convert PDF to Text'
    )
    parser.add_argument(
        '-p',
        '--path',
        type=str,
        help='The full path of the PDf to convert',
        required=True,
    )
    parser.add_argument(
        '-o',
        '--output',
        type=str,
        help='Output text file name. If not specified the text will just be printed out',
        required=False,
    )
    return parser


def format_page(text, page_number):
    """Return *text* followed by the page footer and separator rule.

    Args:
        text: Text extracted from one page.
        page_number: 1-based page number shown in the footer.
    """
    return text + '\n\nPage ' + str(page_number) + '\n' + PAGE_RULE + '\n'


def iter_pages(pdf_path):
    """Yield the formatted text of each page of the PDF at *pdf_path*.

    The file is opened in binary mode and stays open only while pages are
    being read; the ``with`` block closes it even if extraction fails.

    Raises:
        OSError: If the PDF cannot be opened.
    """
    # Imported here so that --help and the pure helpers above keep working
    # when PyPDF2 is not installed.
    import PyPDF2

    # PyPDF2 >= 3.0 removed PdfFileReader / getPage / extractText in favour
    # of PdfReader / .pages / extract_text. Prefer the new names and fall
    # back to the legacy ones so older installs still work.
    reader_cls = getattr(PyPDF2, 'PdfReader', None) or PyPDF2.PdfFileReader

    with open(pdf_path, 'rb') as pdf_file:
        reader = reader_cls(pdf_file)
        for page_number, page in enumerate(reader.pages, start=1):
            extract = getattr(page, 'extract_text', None) or page.extractText
            yield format_page(extract(), page_number)


def main(argv=None):
    """Run the converter.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    pages = []
    for page_text in iter_pages(args.path):
        # Echo each page as soon as it is extracted.
        print(page_text)
        pages.append(page_text)

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as out_file:
            out_file.writelines(pages)


if __name__ == '__main__':
    main()