hgbook
annotate en/autoid.py @ 771:f67a90f6c055
Deep revision of Ch.10.
author | Giulio@puck |
---|---|
date | Fri Jul 31 23:27:41 2009 +0200 (2009-07-31) |
parents | |
children |
rev | line source |
---|---|
bos@584 | 1 #!/usr/bin/env python |
bos@584 | 2 # |
bos@584 | 3 # Add unique ID attributes to para tags. This script should only be |
bos@584 | 4 # run by one person, since otherwise it introduces the possibility of |
bos@584 | 5 # chaotic conflicts among tags. |
bos@584 | 6 |
bos@584 | 7 import glob, os, re, sys |
bos@584 | 8 |
# Matches a <para> opening tag that already carries an id of the form
# "x_<hex digits>"; group 1 captures the hex digits.
tagged = re.compile('<para[^>]* id="x_([0-9a-f]+)"[^>]*>', re.M)
# Matches only a bare <para> tag, i.e. one with no attributes at all.
untagged = re.compile('<para>')

# Files to process: everything matching ch*.xml or app*.xml in the
# current working directory.
names = glob.glob('ch*.xml') + glob.glob('app*.xml')

# First pass: find the highest-numbered paragraph ID.

biggest_id = 0   # largest ID seen so far, as an integer
seen = set()     # every ID encountered, used to detect duplicates
errs = 0         # number of duplicate IDs reported

for name in names:
    # Read the whole file up front and close it right away instead of
    # leaving the handle for the garbage collector.
    with open(name) as fp:
        text = fp.read()
    for m in tagged.finditer(text):
        i = int(m.group(1), 16)
        if i in seen:
            # Report the ID in the same "x_<hex>" form it has in the
            # XML so that it can be searched for directly.  Writing to
            # sys.stderr explicitly works on both Python 2 and 3.
            sys.stderr.write('%s: duplication of ID %s\n'
                             % (name, 'x_%x' % i))
            errs += 1
        seen.add(i)
        if i > biggest_id:
            biggest_id = i
bos@584 | 29 |
def retag(s):
    """Return a ``<para>`` opening tag carrying the next unused ID.

    Intended as the replacement callback for ``untagged.sub``; the
    match object ``s`` is accepted but not inspected.  Every call
    advances the module-level ``biggest_id`` counter by one, so
    successive calls never hand out the same ID.
    """
    global biggest_id
    next_id = biggest_id + 1
    biggest_id = next_id
    return '<para id="x_%x">' % next_id
bos@584 | 34 |
# Second pass: add IDs to paragraphs that currently lack them.

for name in names:
    # Close the input promptly rather than leaking the handle.
    with open(name) as fp:
        original = fp.read()
    # retag() is called once per bare <para> and supplies a fresh ID.
    updated = untagged.sub(retag, original)
    if updated != original:
        # Write the new content to a sibling temporary file first and
        # only then rename it over the original, so the original is
        # replaced only after the new text has been written out.
        tmpname = name + '.tmp'
        with open(tmpname, 'w') as fp:
            fp.write(updated)
        os.rename(tmpname, name)

# The exit status is the number of duplicate IDs found in the first
# pass (0 means none).  Clamp it to 255: where exit statuses are
# truncated to 8 bits, a count such as 256 would otherwise wrap to 0
# and look like success.
sys.exit(min(errs, 255))