Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DCTDecode/Encode implementation #54

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions PDFConsole.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,8 @@ def do_decode(self, argv):
offset = 0
size = 0
validTypes = ['variable', 'file', 'raw']
notImplementedFilters = ['ccittfax''ccf', 'jbig2', 'dct', 'jpx']
#remove dct from the list
notImplementedFilters = ['ccittfax''ccf', 'jbig2', 'jpx']
filters = []
args = self.parseArgs(argv)
if args is None:
Expand Down Expand Up @@ -789,7 +790,8 @@ def do_encode(self, argv):
offset = 0
size = 0
validTypes = ['variable', 'file', 'raw']
notImplementedFilters = ['ascii85', 'a85', 'runlength', 'rl', 'jbig2', 'jpx', 'ccittfax', 'ccf', 'dct']
#KK - remove dct from the list to enable encode on test_encode file
notImplementedFilters = ['ascii85', 'a85', 'runlength', 'rl', 'jbig2', 'jpx', 'ccittfax', 'ccf']
filters = []
args = self.parseArgs(argv)
if args is None:
Expand Down
8 changes: 8 additions & 0 deletions PDFCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -6872,6 +6872,8 @@ def parse (self, fileName, forceMode = False, looseMode = False, manualAnalysis

# Getting the content for each section
bodyContent,xrefContent,trailerContent = self.parsePDFSections(content,forceMode,looseMode)
import pdb
pdb.set_trace()
if xrefContent != None:
xrefOffset = bodyOffset + len(bodyContent)
trailerOffset = xrefOffset + len(xrefContent)
Expand Down Expand Up @@ -6929,17 +6931,22 @@ def parse (self, fileName, forceMode = False, looseMode = False, manualAnalysis
type = ret[1]
pdfObject = pdfIndirectObject.getObject()
if pdfObject != None:

objectType = pdfObject.getType()
if objectType == 'dictionary':
if isFirstBody and not linearizedFound:
if pdfObject.hasElement('/Linearized'):
pdfFile.setLinearized(True)
linearizedFound = True
elif objectType == 'stream' and type == '/XRef':
pdb.set_trace()

xrefObject = pdfIndirectObject
ret = self.createPDFCrossRefSectionFromStream(pdfIndirectObject)
if ret[0] != -1:
xrefStreamSection = ret[1]


else:
if not forceMode:
sys.exit('Error: An error has occurred while parsing an indirect object!!')
Expand All @@ -6962,6 +6969,7 @@ def parse (self, fileName, forceMode = False, looseMode = False, manualAnalysis
ret = body.updateObjects()
if ret[0] == -1:
pdfFile.addError(ret[1])
pdb.set_trace()
pdfFile.addBody(body)
pdfFile.addNumObjects(body.getNumObjects())
pdfFile.addNumStreams(body.getNumStreams())
Expand Down
121 changes: 100 additions & 21 deletions PDFFilters.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#

# Katherine Khusial Final Project CS6963

# peepdf is a tool to analyse and modify PDF files
# http://peepdf.eternal-todo.com
# By Jose Miguel Esparza <jesparza AT eternal-todo.com>
Expand Down Expand Up @@ -59,6 +61,9 @@
import sys, zlib, lzw, struct
from PDFUtils import getNumsFromBytes, getBytesFromBits, getBitsFromNum
from ccitt import CCITTFax
import cv2 #KK added for DCT
import numpy as np #KK added for DCT
import StringIO #KK added for DCT


def decodeStream(stream, filter, parameters={}):
Expand All @@ -70,6 +75,9 @@ def decodeStream(stream, filter, parameters={}):
@param parameters: List of PDFObjects containing the parameters for the filter
@return: A tuple (status,statusContent), where statusContent is the decoded stream in case status = 0 or an error in case status = -1
'''

#print"Decoding Filters!"
#print filter
if filter == '/ASCIIHexDecode' or filter == '/AHx':
ret = asciiHexDecode(stream)
elif filter == '/ASCII85Decode' or filter == '/A85':
Expand All @@ -85,7 +93,7 @@ def decodeStream(stream, filter, parameters={}):
elif filter == '/JBIG2Decode':
ret = jbig2Decode(stream, parameters)
elif filter == '/DCTDecode' or filter == '/DCT':
ret = dctDecode(stream, parameters)
ret = dctDecode(stream, parameters)
elif filter == '/JPXDecode':
ret = jpxDecode(stream)
elif filter == '/Crypt':
Expand All @@ -104,6 +112,8 @@ def encodeStream(stream, filter, parameters={}):
@param parameters: List of PDFObjects containing the parameters for the filter
@return: A tuple (status,statusContent), where statusContent is the encoded stream in case status = 0 or an error in case status = -1
'''
#print "Encoding Filters"
#print filter
if filter == '/ASCIIHexDecode':
ret = asciiHexEncode(stream)
elif filter == '/ASCII85Decode':
Expand Down Expand Up @@ -786,40 +796,109 @@ def decrypt(stream, parameters):
else:
#TODO: algorithm is cryptFilterName, specified in the /CF dictionary
return (-1, 'Decrypt not supported yet')


'''
def dctDecode(stream, parameters):
'''
Method to decode streams using a DCT technique based on the JPEG standard (NOT IMPLEMENTED YET)

@param stream: A PDF stream
@return: A tuple (status,statusContent), where statusContent is the decoded PDF stream in case status = 0 or an error in case status = -1
'''
decodedStream = ''

decodedStream = '' #create variable
try:
from PIL import Image
import StringIO
except:
return (-1, 'Python Imaging Library (PIL) not installed')
# Quick implementation, assuming the library can detect the parameters

# Quick implementation, assuming the library can detect the parameters
try:
#StringIO to read stream as string and open as image
im = Image.open(StringIO.StringIO(stream))
#print "Printing im"
#print im
#f = open('im.jpg', 'w' )
#f.write(im)
#f.close()
decodedStream = im.tostring()
print "Printing Decoded Stream!"
print decodedStream
#f = open('decodedStream.jpg', 'w' )
#f.write(decodedStream)
#f.close()
#neither file opens as an image/decoded image...how is this useful for user?
return (0, decodedStream)
except:
return (-1, 'Error decompresing image data')
'''

# Updated DCTDecode: converts the input stream to an array and outputs the decoded image

def dctEncode(stream, parameters):
'''
Method to encode streams using a DCT technique based on the JPEG standard (NOT IMPLEMENTED YET)
#Read image from memory
def opencv_image_from_stringio(img_stream, cv2_img_flag=0):
    '''
        Method to decode an encoded image held in memory into an OpenCV image array

        @param img_stream: A file-like object (e.g. StringIO) holding the encoded image bytes
        @param cv2_img_flag: Load flag handed to cv2.imdecode (0, the default, is OpenCV's grayscale flag)
        @return: The value returned by cv2.imdecode, or None if the stream holds no data
    '''
    # Rewind first: the caller may have left the position anywhere in the buffer
    img_stream.seek(0)
    raw = img_stream.read()
    if not raw:
        # Nothing to decode; do not hand an empty buffer to OpenCV
        return None
    # cv2.imdecode expects a flat uint8 array containing the encoded file bytes
    img_array = np.asarray(bytearray(raw), dtype=np.uint8)
    return cv2.imdecode(img_array, cv2_img_flag)
#Source: http://stackoverflow.com/questions/13329445/how-to-read-image-from-in-memory-buffer-stringio-or-from-url-with-opencv-pytho

def dctDecode(stream, parameters):
    '''
        Method to decode streams using a DCT technique based on the JPEG standard

        @param stream: A PDF stream
        @param parameters: Filter parameters (not used by this implementation)
        @return: A tuple (status,statusContent), where statusContent is the decoded PDF stream in case status = 0 or an error in case status = -1
    '''
    try:
        # Decode entirely in memory: no debug dumps or temporary image files are written
        img = opencv_image_from_stringio(StringIO.StringIO(stream))
    except Exception:
        return (-1, 'Error decompresing image data')
    # Identity test on purpose: "!= None" on a numpy array is an elementwise comparison.
    # cv2.imdecode reports undecodable data by returning None rather than raising.
    if img is None:
        return (-1, 'Error decompresing image data')
    # NOTE(review): the helper's default flag (0) decodes to grayscale - confirm this is
    # the intended output for colour JPEG streams.
    # Return the raw pixel bytes so statusContent is a plain byte string
    return (0, img.tobytes())

#Read image from memory
def encode_opencv_image_from_stringio(img_stream):
    '''
        Method to JPEG-encode the raw bytes held in an in-memory stream

        The bytes are laid out as a two-column uint8 matrix, scaled by 0.5 on both axes and then encoded as '.jpg'

        @param img_stream: A file-like object (e.g. StringIO) holding the bytes to encode
        @return: The (retval, buffer) pair returned by cv2.imencode
    '''
    # Always read from the very first byte of the stream
    img_stream.seek(0)
    raw_bytes = bytearray(img_stream.read())
    # Two bytes per row; the row count is inferred from the data length
    pixels = np.asarray(raw_bytes, dtype=np.uint8).reshape((-1, 2))
    # dsize of (0,0) makes OpenCV derive the output size from the fx/fy scale factors
    shrunk = cv2.resize(pixels, (0,0), fx=0.5, fy=0.5)
    return cv2.imencode('.jpg', shrunk)


def dctEncode(stream, parameters):
    '''
        Method to encode streams using a DCT technique based on the JPEG standard

        @param stream: A PDF stream
        @param parameters: Filter parameters (not used by this implementation)
        @return: A tuple (status,statusContent), where statusContent is the encoded PDF stream in case status = 0 or an error in case status = -1
    '''
    try:
        retval, img_buff = encode_opencv_image_from_stringio(StringIO.StringIO(stream))
        # cv2.imencode returns a success flag alongside the buffer; do not ignore it
        if not retval:
            return (-1, 'Error encoding image data : %s' % 'the JPEG encoder reported a failure')
        # Take the encoded bytes straight from the numpy buffer. Wrapping the array in
        # StringIO would stringify it with str(), giving its printed form, not the JPEG data.
        # Nothing is written to disk: the encoded data is only returned to the caller.
        return (0, img_buff.tobytes())
    except Exception as e:
        return (-1, 'Error encoding image data : %s' % e)

def jbig2Decode(stream, parameters):
'''
Expand Down Expand Up @@ -862,4 +941,4 @@ def jpxEncode(stream):
@return: A tuple (status,statusContent), where statusContent is the encoded PDF stream in case status = 0 or an error in case status = -1
'''
encodedStream = ''
return (-1, 'JpxEncode not supported yet')
return (-1, 'JpxEncode not supported yet')
6 changes: 4 additions & 2 deletions peepdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,8 @@ def getPeepJSON(statsDict, version, revision):
sys.exit('Error: The script file "' + options.scriptFile + '" does not exist!!')

if fileName is not None:
import pdb
pdb.set_trace()
pdfParser = PDFParser()
ret, pdf = pdfParser.parse(fileName, options.isForceMode, options.isLooseMode, options.isManualAnalysis)
if options.checkOnVT:
Expand Down Expand Up @@ -714,8 +716,8 @@ def getPeepJSON(statsDict, version, revision):
console.cmdloop()
except KeyboardInterrupt as e:
sys.exit()
except:
errorMessage = '*** Error: Exception not handled using the interactive console!! Please, report it to the author!!'
except Exception, e:
errorMessage = '*** Error: Exception not handled using the interactive console!! Please, report it to the author!! \n %s' %e
print errorColor + errorMessage + resetColor + newLine
traceback.print_exc(file=open(errorsFile, 'a'))
except Exception as e:
Expand Down