jesparza · kitkatt989 · Apr 23, 2016 · Apr 23, 2016 · Apr 23, 2016 · May 15, 2016
diff --git a/PDFConsole.py b/PDFConsole.py
@@ -387,7 +387,8 @@ def do_decode(self, argv):
         offset = 0
         size = 0
         validTypes = ['variable', 'file', 'raw']
-        notImplementedFilters = ['ccittfax''ccf', 'jbig2', 'dct', 'jpx']
+        #remove dct from the list
+        notImplementedFilters = ['ccittfax''ccf', 'jbig2', 'jpx']
         filters = []
         args = self.parseArgs(argv)
         if args is None:
@@ -789,7 +790,8 @@ def do_encode(self, argv):
         offset = 0
         size = 0
         validTypes = ['variable', 'file', 'raw']
-        notImplementedFilters = ['ascii85', 'a85', 'runlength', 'rl', 'jbig2', 'jpx', 'ccittfax', 'ccf', 'dct']
+        #KK - remove dct from the list to enable encode on test_encode file 
+        notImplementedFilters = ['ascii85', 'a85', 'runlength', 'rl', 'jbig2', 'jpx', 'ccittfax', 'ccf']
         filters = []
         args = self.parseArgs(argv)
         if args is None:

diff --git a/PDFCore.py b/PDFCore.py
@@ -6872,6 +6872,8 @@ def parse (self, fileName, forceMode = False, looseMode = False, manualAnalysis
 
             # Getting the content for each section
             bodyContent,xrefContent,trailerContent = self.parsePDFSections(content,forceMode,looseMode)
+            import pdb
+            pdb.set_trace()
             if xrefContent != None:    
                 xrefOffset = bodyOffset + len(bodyContent)
                 trailerOffset = xrefOffset + len(xrefContent)
@@ -6929,17 +6931,22 @@ def parse (self, fileName, forceMode = False, looseMode = False, manualAnalysis
                             type = ret[1]
                             pdfObject = pdfIndirectObject.getObject()
                             if pdfObject != None:
+
                                 objectType = pdfObject.getType()
                                 if objectType == 'dictionary':
                                     if isFirstBody and not linearizedFound:
                                         if pdfObject.hasElement('/Linearized'):
                                             pdfFile.setLinearized(True)
                                             linearizedFound = True
                                 elif objectType == 'stream' and type == '/XRef':
+                                    pdb.set_trace()
+
                                     xrefObject = pdfIndirectObject
                                     ret = self.createPDFCrossRefSectionFromStream(pdfIndirectObject)
                                     if ret[0] != -1:
                                         xrefStreamSection = ret[1]    
+
+
                             else:
                                 if not forceMode:
                                     sys.exit('Error: An error has occurred while parsing an indirect object!!')
@@ -6962,6 +6969,7 @@ def parse (self, fileName, forceMode = False, looseMode = False, manualAnalysis
             ret = body.updateObjects()
             if ret[0] == -1:
                 pdfFile.addError(ret[1])
+            pdb.set_trace()
             pdfFile.addBody(body)
             pdfFile.addNumObjects(body.getNumObjects())
             pdfFile.addNumStreams(body.getNumStreams())

diff --git a/PDFFilters.py b/PDFFilters.py
@@ -1,4 +1,6 @@
-#
+
+# Katherine Khusial Final Project CS6963
+
 # peepdf is a tool to analyse and modify PDF files
 #    http://peepdf.eternal-todo.com
 #    By Jose Miguel Esparza <jesparza AT eternal-todo.com>
@@ -59,6 +61,9 @@
 import sys, zlib, lzw, struct
 from PDFUtils import getNumsFromBytes, getBytesFromBits, getBitsFromNum
 from ccitt import CCITTFax
+import cv2 #KK added for DCT 
+import numpy as np #KK added for DCT
+import StringIO #KK added for DCT
 
 
 def decodeStream(stream, filter, parameters={}):
@@ -70,6 +75,9 @@ def decodeStream(stream, filter, parameters={}):
         @param parameters: List of PDFObjects containing the parameters for the filter
         @return: A tuple (status,statusContent), where statusContent is the decoded stream in case status = 0 or an error in case status = -1
     '''
+
+    #print"Decoding Filters!"
+    #print filter
     if filter == '/ASCIIHexDecode' or filter == '/AHx':
         ret = asciiHexDecode(stream)
     elif filter == '/ASCII85Decode' or filter == '/A85':
@@ -85,7 +93,7 @@ def decodeStream(stream, filter, parameters={}):
     elif filter == '/JBIG2Decode':
         ret = jbig2Decode(stream, parameters)
     elif filter == '/DCTDecode' or filter == '/DCT':
-        ret = dctDecode(stream, parameters)
+        ret = dctDecode(stream, parameters) 
     elif filter == '/JPXDecode':
         ret = jpxDecode(stream)
     elif filter == '/Crypt':
@@ -104,6 +112,8 @@ def encodeStream(stream, filter, parameters={}):
         @param parameters: List of PDFObjects containing the parameters for the filter
         @return: A tuple (status,statusContent), where statusContent is the encoded stream in case status = 0 or an error in case status = -1
     '''
+    #print "Encoding Filters"
+    #print filter
     if filter == '/ASCIIHexDecode':
         ret = asciiHexEncode(stream)
     elif filter == '/ASCII85Decode':
@@ -786,40 +796,109 @@ def decrypt(stream, parameters):
             else:
                 #TODO: algorithm is cryptFilterName, specified in the /CF dictionary
                 return (-1, 'Decrypt not supported yet')
-
-
+'''
 def dctDecode(stream, parameters):
-    '''
-        Method to decode streams using a DCT technique based on the JPEG standard (NOT IMPLEMENTED YET)
-
-        @param stream: A PDF stream
-        @return: A tuple (status,statusContent), where statusContent is the decoded PDF stream in case status = 0 or an error in case status = -1
-    '''
-    decodedStream = ''
+
+    decodedStream = '' #create variable
     try:
         from PIL import Image
         import StringIO
     except:
         return (-1, 'Python Imaging Library (PIL) not installed')
-    # Quick implementation, assuming the library can detect the parameters
+
+# Quick implementation, assuming the library can detect the parameters
     try:
+        #StringIO to read stream as string and open as image
         im = Image.open(StringIO.StringIO(stream))
+        #print "Printing im"
+        #print im
+        #f = open('im.jpg', 'w' )
+        #f.write(im)
+        #f.close()
         decodedStream = im.tostring()
+        print "Printing Decoded Stream!"
+        print decodedStream
+        #f = open('decodedStream.jpg', 'w' )
+        #f.write(decodedStream)
+        #f.close()
+        #neither file opens as an image/decoded image...how is this useful for user?
         return (0, decodedStream)
     except:
         return (-1, 'Error decompresing image data')
+'''
 
+#Updated DCTDecode: input stream is read into an array, output is the decoded image
 
-def dctEncode(stream, parameters):
-    '''
-        Method to encode streams using a DCT technique based on the JPEG standard (NOT IMPLEMENTED YET)
+#Read image from memory
+def opencv_image_from_stringio(img_stream, cv2_img_flag=0): 
+    img_stream.seek(0) #start at beginning of stream
+    #read stream and convert to array
+    img_array = np.asarray(bytearray(img_stream.read()), dtype=np.uint8)
+    #print "Printing Image Array!"
+    #print img_array
+    return cv2.imdecode(img_array, cv2_img_flag)
+#Source: http://stackoverflow.com/questions/13329445/how-to-read-image-from-in-memory-buffer-stringio-or-from-url-with-opencv-pytho
+
+def dctDecode(stream, parameters):
 
-        @param stream: A PDF stream
-        @return: A tuple (status,statusContent), where statusContent is the encoded PDF stream in case status = 0 or an error in case status = -1
-    '''
-    encodedStream = ''
-    return (-1, 'DctEncode not supported yet')
+    try:
+        import tempfile
+        tf = tempfile.NamedTemporaryFile()
+        tmp_filename = tf.name
+
+        #read stream from memory using StringIO
+        imgString = StringIO.StringIO(stream)
+        #run opencv_image_from_stringio function on imgString and save to img1
+        img1 = opencv_image_from_stringio(imgString)
+        #write output to file
+        test = open('test_encode', 'w')
+        test.write(img1)
+        test.close()
+
+        if img1 != None:
+            #print "Printing tmp_filename and img1!!!"
+            #print tmp_filename, img1
+            try:
+                #save decoded image in temp folder
+                cv2.cv.SaveImage('%s.jpg' %tmp_filename, cv2.cv.fromarray(img1))
+                #print cv2.cv.fromarray(img1)
+            except Exception, e:
+                print e
+        return (0, cv2.cv.fromarray(imgString))
+    except:
+        return (-1, 'Error decompresing image data')
+
+#Read raw bytes from memory, downscale by half and encode as JPEG
+def encode_opencv_image_from_stringio(img_stream):
+    img_stream.seek(0) #start at the beginning on the stream
+    #reading the stream and converting it into an array
+    img_array = np.asarray(bytearray(img_stream.read()), dtype=np.uint8)
+    #print "Printing Encode Array"
+    #print img_array
+    img_array2 = np.reshape(img_array, (-1, 2))
+    tiny = cv2.resize(img_array2, (0,0), fx=0.5, fy=0.5) 
+    return cv2.imencode('.jpg', tiny)
+
+
+def dctEncode(stream, parameters):
+    try:
+        imgString = StringIO.StringIO(stream)
+        #return the value and assign to img_buff
+        retval, img_buff = encode_opencv_image_from_stringio(imgString)
+
+        fh_buff = open('test_encode_buf', 'w')
+        fh_buff.write(img_buff)
+        fh_buff.close()
+
+        imgString = StringIO.StringIO(img_buff)
+        imgString_out = imgString.getvalue()
 
+        fh_stringio = open('test_encode_stringio', 'w')
+        fh_stringio.write(imgString_out)
+        fh_stringio.close()
+        return (0, imgString_out)
+    except Exception, e: 
+        return (-1, 'Error encoding image data : %s' %e)
 
 def jbig2Decode(stream, parameters):
     '''
@@ -862,4 +941,4 @@ def jpxEncode(stream):
         @return: A tuple (status,statusContent), where statusContent is the encoded PDF stream in case status = 0 or an error in case status = -1
     '''
     encodedStream = ''
-    return (-1, 'JpxEncode not supported yet')
+    return (-1, 'JpxEncode not supported yet')
diff --git a/peepdf.py b/peepdf.py
@@ -488,6 +488,8 @@ def getPeepJSON(statsDict, version, revision):
                 sys.exit('Error: The script file "' + options.scriptFile + '" does not exist!!')
 
         if fileName is not None:
+            import pdb
+            pdb.set_trace()
             pdfParser = PDFParser()
             ret, pdf = pdfParser.parse(fileName, options.isForceMode, options.isLooseMode, options.isManualAnalysis)
             if options.checkOnVT:
@@ -714,8 +716,8 @@ def getPeepJSON(statsDict, version, revision):
                             console.cmdloop()
                         except KeyboardInterrupt as e:
                             sys.exit()
-                        except:
-                            errorMessage = '*** Error: Exception not handled using the interactive console!! Please, report it to the author!!'
+                        except Exception, e:
+                            errorMessage = '*** Error: Exception not handled using the interactive console!! Please, report it to the author!! \n %s' %e
                             print errorColor + errorMessage + resetColor + newLine
                             traceback.print_exc(file=open(errorsFile, 'a'))
 except Exception as e: