diff --git a/PDFConsole.py b/PDFConsole.py old mode 100644 new mode 100755 index 548ac3b..510a680 --- a/PDFConsole.py +++ b/PDFConsole.py @@ -387,7 +387,8 @@ def do_decode(self, argv): offset = 0 size = 0 validTypes = ['variable', 'file', 'raw'] - notImplementedFilters = ['ccittfax''ccf', 'jbig2', 'dct', 'jpx'] + #remove dct from the list + notImplementedFilters = ['ccittfax''ccf', 'jbig2', 'jpx'] filters = [] args = self.parseArgs(argv) if args is None: @@ -789,7 +790,8 @@ def do_encode(self, argv): offset = 0 size = 0 validTypes = ['variable', 'file', 'raw'] - notImplementedFilters = ['ascii85', 'a85', 'runlength', 'rl', 'jbig2', 'jpx', 'ccittfax', 'ccf', 'dct'] + #KK - remove dct from the list to enable encode on test_encode file + notImplementedFilters = ['ascii85', 'a85', 'runlength', 'rl', 'jbig2', 'jpx', 'ccittfax', 'ccf'] filters = [] args = self.parseArgs(argv) if args is None: diff --git a/PDFCore.py b/PDFCore.py index 2267cd7..6f2af98 100644 --- a/PDFCore.py +++ b/PDFCore.py @@ -6872,6 +6872,8 @@ def parse (self, fileName, forceMode = False, looseMode = False, manualAnalysis # Getting the content for each section bodyContent,xrefContent,trailerContent = self.parsePDFSections(content,forceMode,looseMode) + import pdb + pdb.set_trace() if xrefContent != None: xrefOffset = bodyOffset + len(bodyContent) trailerOffset = xrefOffset + len(xrefContent) @@ -6929,6 +6931,7 @@ def parse (self, fileName, forceMode = False, looseMode = False, manualAnalysis type = ret[1] pdfObject = pdfIndirectObject.getObject() if pdfObject != None: + objectType = pdfObject.getType() if objectType == 'dictionary': if isFirstBody and not linearizedFound: @@ -6936,10 +6939,14 @@ def parse (self, fileName, forceMode = False, looseMode = False, manualAnalysis pdfFile.setLinearized(True) linearizedFound = True elif objectType == 'stream' and type == '/XRef': + pdb.set_trace() + xrefObject = pdfIndirectObject ret = self.createPDFCrossRefSectionFromStream(pdfIndirectObject) if ret[0] != -1: xrefStreamSection = ret[1] + + else: if not forceMode: sys.exit('Error: An error has occurred while parsing an indirect object!!') @@ -6962,6 +6969,7 @@ def parse (self, fileName, forceMode = False, looseMode = False, manualAnalysis ret = body.updateObjects() if ret[0] == -1: pdfFile.addError(ret[1]) + pdb.set_trace() pdfFile.addBody(body) pdfFile.addNumObjects(body.getNumObjects()) pdfFile.addNumStreams(body.getNumStreams()) diff --git a/PDFFilters.py b/PDFFilters.py old mode 100644 new mode 100755 index b862d77..67230d5 --- a/PDFFilters.py +++ b/PDFFilters.py @@ -1,4 +1,6 @@ -# + +# Katherine Khusial Final Project CS6963 + # peepdf is a tool to analyse and modify PDF files # http://peepdf.eternal-todo.com # By Jose Miguel Esparza @@ -59,6 +61,9 @@ import sys, zlib, lzw, struct from PDFUtils import getNumsFromBytes, getBytesFromBits, getBitsFromNum from ccitt import CCITTFax +import cv2 #KK added for DCT +import numpy as np #KK added for DCT +import StringIO #KK added for DCT def decodeStream(stream, filter, parameters={}): @@ -70,6 +75,9 @@ def decodeStream(stream, filter, parameters={}): @param parameters: List of PDFObjects containing the parameters for the filter @return: A tuple (status,statusContent), where statusContent is the decoded stream in case status = 0 or an error in case status = -1 ''' + + #print"Decoding Filters!" + #print filter if filter == '/ASCIIHexDecode' or filter == '/AHx': ret = asciiHexDecode(stream) elif filter == '/ASCII85Decode' or filter == '/A85': @@ -85,7 +93,7 @@ def decodeStream(stream, filter, parameters={}): elif filter == '/JBIG2Decode': ret = jbig2Decode(stream, parameters) elif filter == '/DCTDecode' or filter == '/DCT': - ret = dctDecode(stream, parameters) + ret = dctDecode(stream, parameters) elif filter == '/JPXDecode': ret = jpxDecode(stream) elif filter == '/Crypt': @@ -104,6 +112,8 @@ def encodeStream(stream, filter, parameters={}): @param parameters: List of PDFObjects containing the parameters for the filter @return: A tuple (status,statusContent), where statusContent is the encoded stream in case status = 0 or an error in case status = -1 ''' + #print "Encoding Filters" + #print filter if filter == '/ASCIIHexDecode': ret = asciiHexEncode(stream) elif filter == '/ASCII85Decode': @@ -786,40 +796,109 @@ def decrypt(stream, parameters): else: #TODO: algorithm is cryptFilterName, specified in the /CF dictionary return (-1, 'Decrypt not supported yet') - - +''' def dctDecode(stream, parameters): - ''' - Method to decode streams using a DCT technique based on the JPEG standard (NOT IMPLEMENTED YET) - - @param stream: A PDF stream - @return: A tuple (status,statusContent), where statusContent is the decoded PDF stream in case status = 0 or an error in case status = -1 - ''' - decodedStream = '' + + decodedStream = '' #create variable try: from PIL import Image import StringIO except: return (-1, 'Python Imaging Library (PIL) not installed') - # Quick implementation, assuming the library can detect the parameters + +# Quick implementation, assuming the library can detect the parameters try: + #StringIO to read stream as string and open as image im = Image.open(StringIO.StringIO(stream)) + #print "Printing im" + #print im + #f = open('im.jpg', 'w' ) + #f.write(im) + #f.close() decodedStream = im.tostring() + print "Printing Decoded Stream!" + print decodedStream + #f = open('decodedStream.jpg', 'w' ) + #f.write(decodedStream) + #f.close() + #neither file opens as an image/decoded image...how is this useful for user? return (0, decodedStream) except: return (-1, 'Error decompresing image data') +''' +#Update DCTDecoded: Input steam to array output decoded image -def dctEncode(stream, parameters): - ''' - Method to encode streams using a DCT technique based on the JPEG standard (NOT IMPLEMENTED YET) +#Read image from memory +def opencv_image_from_stringio(img_stream, cv2_img_flag=0): + img_stream.seek(0) #start at beginning of stream + #read stream and convert to array + img_array = np.asarray(bytearray(img_stream.read()), dtype=np.uint8) + #print "Printing Image Array!" + #print img_array + return cv2.imdecode(img_array, cv2_img_flag) +#Source: http://stackoverflow.com/questions/13329445/how-to-read-image-from-in-memory-buffer-stringio-or-from-url-with-opencv-pytho + +def dctDecode(stream, parameters): - @param stream: A PDF stream - @return: A tuple (status,statusContent), where statusContent is the encoded PDF stream in case status = 0 or an error in case status = -1 - ''' - encodedStream = '' - return (-1, 'DctEncode not supported yet') + try: + import tempfile + tf = tempfile.NamedTemporaryFile() + tmp_filename = tf.name + + #read stream from memory using StringIO + imgString = StringIO.StringIO(stream) + #run opencv_image_from_stringio function on imgString and save to img1 + img1 = opencv_image_from_stringio(imgString) + #write to output to file + test = open('test_encode', 'w') + test.write(img1) + test.close() + + if img1 != None: + #print "Printing tmp_filename and img1!!!" + #print tmp_filename, img1 + try: + #save decoded image in temp folder + cv2.cv.SaveImage('%s.jpg' %tmp_filename, cv2.cv.fromarray(img1)) + #print cv2.cv.fromarray(img1) + except Exception, e: + print e + return (0, cv2.cv.fromarray(imgString)) + except: + return (-1, 'Error decompresing image data') + +#Read image from memory +def encode_opencv_image_from_stringio(img_stream): + img_stream.seek(0) #start at the beginning on the stream + #reading the stream and converting it into an array + img_array = np.asarray(bytearray(img_stream.read()), dtype=np.uint8) + #print "Printing Encode Array" + #print img_array + img_array2 = np.reshape(img_array, (-1, 2)) + tiny = cv2.resize(img_array2, (0,0), fx=0.5, fy=0.5) + return cv2.imencode('.jpg', tiny) + + +def dctEncode(stream, parameters): + try: + imgString = StringIO.StringIO(stream) + #return the value and assign to img_buff + retval, img_buff = encode_opencv_image_from_stringio(imgString) + + fh_buff = open('test_encode_buf', 'w') + fh_buff.write(img_buff) + fh_buff.close() + + imgString = StringIO.StringIO(img_buff) + imgString_out = imgString.getvalue() + fh_stringio = open('test_encode_stringio', 'w') + fh_stringio.write(imgString_out) + fh_stringio.close() + return (0, imgString_out) + except Exception, e: + return (-1, 'Error encoding image data : %s' %e) def jbig2Decode(stream, parameters): ''' @@ -862,4 +941,4 @@ def jpxEncode(stream): @return: A tuple (status,statusContent), where statusContent is the encoded PDF stream in case status = 0 or an error in case status = -1 ''' encodedStream = '' - return (-1, 'JpxEncode not supported yet') \ No newline at end of file + return (-1, 'JpxEncode not supported yet') diff --git a/peepdf.py b/peepdf.py index 59c7a69..619e307 100755 --- a/peepdf.py +++ b/peepdf.py @@ -488,6 +488,8 @@ def getPeepJSON(statsDict, version, revision): sys.exit('Error: The script file "' + options.scriptFile + '" does not exist!!') if fileName is not None: + import pdb + pdb.set_trace() pdfParser = PDFParser() ret, pdf = pdfParser.parse(fileName, options.isForceMode, options.isLooseMode, options.isManualAnalysis) if options.checkOnVT: @@ -714,8 +716,8 @@ def getPeepJSON(statsDict, version, revision): console.cmdloop() except KeyboardInterrupt as e: sys.exit() - except: - errorMessage = '*** Error: Exception not handled using the interactive console!! Please, report it to the author!!' + except Exception, e: + errorMessage = '*** Error: Exception not handled using the interactive console!! Please, report it to the author!! \n %s' %e print errorColor + errorMessage + resetColor + newLine traceback.print_exc(file=open(errorsFile, 'a')) except Exception as e: