annotate en/autoid.py @ 791:bf84d6b2281c
Minor changes and translation of code snippets for Ch.7.
author |
Giulio@puck |
date |
Tue Aug 11 23:01:30 2009 +0200 (2009-08-11) |
parents |
|
children |
|
rev |
line source |
bos@584
|
1 #!/usr/bin/env python
|
bos@584
|
2 #
|
bos@584
|
3 # Add unique ID attributes to para tags. This script should only be
|
bos@584
|
4 # run by one person, since otherwise it introduces the possibility of
|
bos@584
|
5 # chaotic conflicts among tags.
|
bos@584
|
6
|
bos@584
|
7 import glob, os, re, sys
|
bos@584
|
8
|
bos@584
|
# Matches a <para> start tag that already carries an id of the form
# x_<hex digits>; group 1 captures the hexadecimal digits.
tagged = re.compile(r'<para[^>]* id="x_([0-9a-f]+)"[^>]*>', re.MULTILINE)

# Matches only the bare <para> start tag, i.e. one with no attributes.
untagged = re.compile(r'<para>')

# Files to process: everything matching ch*.xml, followed by everything
# matching app*.xml, looked up in the current directory.
names = []
for pattern in ('ch*.xml', 'app*.xml'):
    names.extend(glob.glob(pattern))
|
bos@584
|
13
|
bos@584
|
# First pass: find the highest-numbered paragraph ID.

biggest_id = 0  # largest ID found so far, as an integer (IDs are hex in the files)
seen = set()    # integer value of every ID encountered across all files
errs = 0        # number of duplicate IDs found; used as the exit status

for name in names:
    # Read the whole file, closing the handle as soon as we are done with it.
    with open(name) as fp:
        text = fp.read()
    for m in tagged.finditer(text):
        i = int(m.group(1), 16)
        if i in seen:
            # Report the ID the way it is spelled in the file (x_<hex>) so the
            # message can be searched for directly.  sys.stderr.write() is
            # used because it behaves the same on Python 2 and Python 3.
            sys.stderr.write('%s: duplication of ID x_%s\n'
                             % (name, m.group(1)))
            errs += 1
        seen.add(i)
        if i > biggest_id:
            biggest_id = i
|
bos@584
|
29
|
bos@584
|
def retag(s):
    """Return a ``<para>`` start tag carrying the next unused ID.

    Each call bumps the module-level ``biggest_id`` counter by one and
    formats the new value as lowercase hexadecimal after an ``x_`` prefix.

    ``s`` is accepted but never used; the signature matches what
    ``re.sub`` passes to a replacement callable (the match object).
    """
    global biggest_id
    biggest_id = biggest_id + 1
    return '<para id="x_%x">' % (biggest_id,)
|
bos@584
|
34
|
bos@584
|
# Second pass: add IDs to paragraphs that currently lack them.

for name in names:
    # Read the current contents, closing the handle even if read() fails.
    with open(name) as fp:
        f = fp.read()
    # Every bare <para> is replaced by a tag with a freshly allocated ID.
    f1 = untagged.sub(retag, f)
    if f1 != f:
        # Only rewrite files that actually changed.  The new text goes to a
        # sibling ".tmp" file which is then renamed over the original --
        # presumably so an interrupted write cannot truncate the original.
        tmpname = name + '.tmp'
        with open(tmpname, 'w') as fp:
            fp.write(f1)
        os.rename(tmpname, name)

# Exit with the number of duplicate IDs found in the first pass (0 means
# success).  The value is capped at 255 because POSIX exit statuses are
# truncated to 8 bits: an uncapped count of 256 would be seen as success.
sys.exit(min(errs, 255))
|