forked from ieure/sicp
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfix_toc.py
114 lines (80 loc) · 3.27 KB
/
fix_toc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from collections import defaultdict, Counter
from HTMLParser import HTMLParser
import os.path
import re
import sys
from urlparse import urldefrag
from xml.etree import ElementTree
ncx_namespace = 'http://www.daisy.org/z3986/2005/ncx/'
ElementTree.register_namespace('', ncx_namespace)
toc_file = sys.argv[1]
content_dir, _ = os.path.split(toc_file)
found_sections = defaultdict(Counter)
def _tag_name(name):
return '{{{0}}}{1}'.format(ncx_namespace, name)
def _section_id(section_number):
return '__sec_' + section_number
def subsection_source(content_src, subsection_number):
section_number = subsection_match.group(1)
section_id = _section_id(section_number)
new_src = '#'.join([content_src, section_id])
return new_src
class SectionFinder(HTMLParser):
def __init__(self, src_file, section_title):
HTMLParser.__init__(self)
self.last_id = None
self.new_src = None
self.consume_text = False
self.current_text = ''
self.section_title = section_title
self.src_file = src_file
self.section_title_index = found_sections[src_file][section_title]
self.found_sections = 0
def build_src(self, anchor_id):
return '#'.join([self.src_file, anchor_id])
def handle_starttag(self, tag, attrs):
attrs = dict(attrs)
if tag == 'a' and 'id' in attrs:
self.last_id = attrs['id']
elif self.last_id is not None and tag.startswith('h'):
self.consume_text = True
def handle_endtag(self, tag):
if self.last_id and tag.startswith('h'):
if self.current_text == self.section_title:
if self.found_sections == self.section_title_index:
self.new_src = self.build_src(self.last_id)
found_sections[self.src_file][self.section_title] += 1
self.found_sections += 1
self.last_id = None
self.consume_text = False
self.current_text = ''
def handle_data(self, data):
if self.consume_text:
self.current_text += data
def find_content_source(section_title, content_src):
source_path = os.path.join(content_dir, content_src)
section_finder = SectionFinder(content_src, section_title)
with open(source_path) as source_file:
section_finder.feed(source_file.read())
return section_finder.new_src
doc = ElementTree.parse(toc_file)
for nav_point in doc.iter(_tag_name('navPoint')):
text_node = nav_point.find(_tag_name('navLabel')).find(_tag_name('text'))
content_node = nav_point.find(_tag_name('content'))
if text_node is None or content_node is None:
continue
old_src = content_node.get('src')
content_src, fragment = urldefrag(old_src)
if not fragment:
continue
subsection_match = re.match(r'^([0-9.]+)', text_node.text)
if subsection_match:
new_src = subsection_source(content_src, subsection_match.group(1))
else:
new_src = find_content_source(text_node.text, content_src)
if new_src is None:
print 'Cannot find reference for {0} in {1}'.format(text_node.text, content_src)
continue
print old_src, '->', new_src
content_node.set('src', new_src)
doc.write(toc_file + '-new', xml_declaration=True, encoding='UTF-8')