hgbook
diff en/autoid.py @ 774:5c9552a4e552
Deep revision of Ch.14.
author | Giulio@puck |
---|---|
date | Tue Aug 04 18:15:35 2009 +0200 (2009-08-04) |
parents | |
children |
line diff
#!/usr/bin/env python
#
# Add unique ID attributes to para tags.  This script should only be
# run by one person, since otherwise it introduces the possibility of
# chaotic conflicts among tags.

import glob
import os
import re
import sys

# A <para ...> tag that already carries an id of the form x_<hex>.
tagged = re.compile('<para[^>]* id="x_([0-9a-f]+)"[^>]*>', re.M)
# A bare <para> tag with no attributes at all; only these get an ID added.
untagged = re.compile('<para>')


def read_file(name):
    """Return the whole contents of the file `name`, closing it afterwards."""
    with open(name) as fp:
        return fp.read()


def find_biggest_id(names):
    """First pass: find the highest-numbered paragraph ID.

    Scans every file in `names` for tagged paragraphs.  Each ID that was
    already seen (in this file or an earlier one) is reported on stderr
    and counted as an error.

    Returns a tuple (biggest_id, errs); biggest_id is 0 when no tagged
    paragraph exists.
    """
    biggest_id = 0
    seen = set()
    errs = 0
    for name in names:
        for m in tagged.finditer(read_file(name)):
            i = int(m.group(1), 16)
            if i in seen:
                # Report the ID in the hexadecimal x_<hex> form used in
                # the XML so it can be searched for directly.
                sys.stderr.write('%s: duplication of ID %s\n'
                                 % (name, 'x_%x' % i))
                errs += 1
            seen.add(i)
            if i > biggest_id:
                biggest_id = i
    return biggest_id, errs


def add_ids(text, biggest_id):
    """Give every bare <para> in `text` a fresh ID.

    IDs are allocated sequentially, starting at biggest_id + 1, and are
    written in lowercase hexadecimal.

    Returns a tuple (new_text, new_biggest_id); new_biggest_id is the
    last ID handed out, or `biggest_id` unchanged if nothing was tagged.
    """
    # One-element list so the nested callback can update the counter
    # without needing a module-level global.
    counter = [biggest_id]

    def retag(m):
        counter[0] += 1
        return '<para id="x_%x">' % counter[0]

    return untagged.sub(retag, text), counter[0]


def rewrite_file(name, text):
    """Replace the contents of `name` with `text`.

    The text is written to `name` + '.tmp' first and then renamed over
    the original, so the original is not left half-written.
    """
    tmpname = name + '.tmp'
    with open(tmpname, 'w') as fp:
        fp.write(text)
    os.rename(tmpname, name)


def main():
    """Tag the ch*.xml and app*.xml files in the current directory.

    Returns the process exit status: the number of duplicate IDs found,
    capped at 255.
    """
    names = glob.glob('ch*.xml') + glob.glob('app*.xml')

    biggest_id, errs = find_biggest_id(names)

    # Second pass: add IDs to paragraphs that currently lack them.
    for name in names:
        old = read_file(name)
        new, biggest_id = add_ids(old, biggest_id)
        if new != old:
            rewrite_file(name, new)

    # Exit statuses wrap modulo 256 on POSIX; cap the count so that a
    # multiple of 256 errors cannot be reported as success.
    return min(errs, 255)


if __name__ == '__main__':
    sys.exit(main())