hgbook: en/autoid.py annotate

hgbook

Deep revision of Ch.10.

rev	line source
bos@584	1 #!/usr/bin/env python
bos@584	2 #
bos@584	3 # Add unique ID attributes to para tags. This script should only be
bos@584	4 # run by one person, since otherwise it introduces the possibility of
bos@584	5 # chaotic conflicts among tags.
bos@584	6
bos@584	7 import glob, os, re, sys
bos@584	8
# Matches a <para> tag that already carries an id of the form "x_<hex>";
# group 1 captures the hexadecimal digits.
tagged = re.compile(r'<para[^>]* id="x_([0-9a-f]+)"[^>]*>', re.M)
# Matches a bare <para> tag with no attributes, i.e. one still lacking an ID.
untagged = re.compile(r'<para>')

# Files to process, looked up relative to the current directory.
# A pattern without a wildcard can match at most one literally-named file,
# which makes glob pointless, so wildcard patterns are used here.
# NOTE(review): assumes chapter/appendix files are named ch*.xml and
# app*.xml -- confirm against the book's source tree.
names = glob.glob('ch*.xml') + glob.glob('app*.xml')
bos@584	13
# First pass: find the highest-numbered paragraph ID.

biggest_id = 0  # largest ID value seen so far; stays 0 if nothing is tagged
seen = set()    # every ID value encountered, used to detect duplicates
errs = 0        # number of duplicate IDs found

for name in names:
    # Read the whole file, closing the handle promptly instead of leaving
    # it to the garbage collector.
    with open(name) as fp:
        text = fp.read()
    for m in tagged.finditer(text):
        i = int(m.group(1), 16)
        if i in seen:
            # Report the ID the way it is spelled in the XML (x_<hex>) so
            # the offending tag can be searched for directly.
            sys.stderr.write('%s: duplication of ID x_%x\n' % (name, i))
            errs += 1
        seen.add(i)
        if i > biggest_id:
            biggest_id = i
bos@584	29
def retag(s):
    """Return a <para> opening tag carrying the next unused ID.

    Intended as the replacement callback for ``untagged.sub``; the
    argument ``s`` is ignored.  Each call advances the module-level
    ``biggest_id`` counter by one and formats the new value in
    lowercase hexadecimal.
    """
    global biggest_id
    next_id = biggest_id + 1
    biggest_id = next_id
    return '<para id="x_%x">' % (next_id,)
bos@584	34
# Second pass: add IDs to paragraphs that currently lack them.

for name in names:
    with open(name) as fp:
        original = fp.read()
    # Every bare <para> is replaced by a tag with a freshly allocated ID.
    updated = untagged.sub(retag, original)
    if updated != original:
        # Write the new content to a temporary sibling first, so the
        # original is only replaced once the new text is fully written.
        tmpname = name + '.tmp'
        with open(tmpname, 'w') as fp:
            fp.write(updated)
        os.rename(tmpname, name)
bos@584	46
# Exit with the number of duplicate IDs found (0 means success).  The value
# is capped at 255 because POSIX exit statuses keep only the low 8 bits, so
# an uncapped count of 256 would be reported as success.
sys.exit(min(errs, 255))