-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathpdf_dumper.py
91 lines (79 loc) · 2.7 KB
/
pdf_dumper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
from __future__ import print_function
import hashlib
import Image
import pyPdf
import sys
class PDFDumper(object):
    """Dump resources from a pdf.

    Currently, only images are supported.  Each distinct image found in the
    pages' ``/XObject`` resources is written as ``<sha1-of-data>.png``.
    """

    # PDF colour-space name -> PIL image mode used to decode the raw samples.
    _image_colorspaces = {
        '/DeviceRGB': 'RGB',
        '/DeviceGray': 'L',
    }

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # colour-space type check does not die with NameError elsewhere.
    try:
        _string_types = basestring  # noqa: F821
    except NameError:
        _string_types = str

    def __init__(self, fp):
        """Wrap the file object *fp* in a ``pyPdf.PdfFileReader``.

        The file object is kept on ``self.fp``; it is not closed here.
        """
        self.fp = fp
        self.reader = pyPdf.PdfFileReader(fp)

    def get_resources(self):
        """Yield every distinct XObject resource referenced by the pages.

        Pages without ``/Resources`` or without an ``/XObject`` entry are
        skipped.  Resources are de-duplicated by ``resource_name`` (a hash
        of their data), so an object shared by several pages is yielded
        only once.  A resource whose name cannot be computed because
        ``NotImplementedError`` or ``AssertionError`` is raised is reported
        with a warning and skipped.
        """
        extracted = set()
        for page in self.reader.pages:
            try:
                resources = page['/Resources']
            except KeyError:
                continue
            try:
                xobject = resources['/XObject']
            except KeyError:
                # Are there types other than XObject?
                continue
            # ``values()`` behaves like the former ``itervalues()`` here but
            # is available on both Python 2 and Python 3 mappings.
            for res in xobject.values():
                # In case it's indirect
                res = res.getObject()
                try:
                    name = self.resource_name(res)
                except (NotImplementedError, AssertionError) as ex:
                    print('Warning: {}'.format(ex))
                    continue
                if name in extracted:
                    continue
                extracted.add(name)
                yield res

    def get_image(self, resource):
        """Build a PIL image from an image XObject.

        Returns the image, or ``None`` (after printing a "Not implemented"
        message) when the colour space is missing or not handled.  Handled
        cases are the names listed in ``_image_colorspaces`` and
        ``/Indexed`` (palette) colour-space arrays.
        """
        dimensions = (resource['/Width'], resource['/Height'])
        if '/ColorSpace' not in resource:
            # Previously a KeyError that aborted the whole dump.
            print('Not implemented, image without /ColorSpace')
            return None
        colorspace = resource['/ColorSpace']

        if isinstance(colorspace, self._string_types):
            # Basic image: the colour space is a plain name.
            try:
                mode = self._image_colorspaces[colorspace]
            except KeyError:
                # Unknown name (e.g. /DeviceCMYK): report it the same way
                # as unsupported array colour spaces instead of crashing.
                print('Not implemented, image type: {}'.format(colorspace))
                return None
            im = Image.new(mode, dimensions)
            im.frombytes(resource.getData())
            return im

        # Otherwise the colour space is an array; only palettes are handled.
        if colorspace[0] != '/Indexed':
            print('Not implemented, image type: {}{}'.format(
                colorspace[0], colorspace[1]))
            return None

        # Paletteized image: element 3 of the array holds the lookup table.
        palette = colorspace[3].getObject().getData()
        im = Image.new('P', dimensions)
        im.putpalette(palette)
        im.frombytes(resource.getData())
        return im

    @staticmethod
    def resource_name(resource):
        """Return the SHA-1 hex digest of the resource's data."""
        return hashlib.sha1(resource.getData()).hexdigest()

    def save_resources(self):
        """Save every supported image resource as ``<resource_name>.png``.

        Files are written using a relative name, i.e. into the current
        working directory.  Images that ``get_image`` cannot decode are
        skipped; non-image resources are reported as unknown.
        """
        for res in self.get_resources():
            # Everything here came out of an /XObject dictionary, so a
            # missing /Type entry is treated as '/XObject' rather than
            # raising KeyError.  Subscript access (not ``.get``) is kept so
            # pyPdf's own ``__getitem__`` still runs.
            res_type = res['/Type'] if '/Type' in res else '/XObject'
            subtype = res['/Subtype'] if '/Subtype' in res else None
            if res_type == '/XObject' and subtype == '/Image':
                im = self.get_image(res)
                if im is None:
                    # get_image already printed why; calling .save() on
                    # None used to raise AttributeError here.
                    continue
                im.save('{}.png'.format(self.resource_name(res)))
                continue
            print('Unknown Resource: {}/{}'.format(res_type, subtype))
if __name__ == '__main__':
    # Command-line entry point: dump the images of the PDF named by the
    # first argument.
    if len(sys.argv) < 2:
        print('Usage: pdf_dumper <filename>')
        sys.exit(1)
    # A PDF is binary data, so open it in 'rb' mode (text mode can corrupt
    # the stream on some platforms); ``with`` closes the handle afterwards.
    with open(sys.argv[1], 'rb') as fp:
        dumper = PDFDumper(fp)
        dumper.save_resources()