#!/usr/bin/env python3.1

import re, sys
import AlpinoCorpus

# Matches the opening <alpino_ds ...> root tag together with any attributes
# it currently carries.  Compiled once because it is applied to every entry.
ROOT_TAG = re.compile(b'<alpino_ds[^>]*>')


def output_target(infile):
    """Return ``(outfile, writer_type)`` for the corpus stored at *infile*.

    The output name is derived from the input name by dropping a known
    corpus suffix and appending ``-updated``:

    * ``X.data.dz`` / ``X.index`` -> ``X-updated``      (COMPACT_CORPUS_WRITER)
    * ``X.dact``                  -> ``X-updated.dact`` (DBXML_CORPUS_WRITER)
    * anything else               -> ``X-updated.dact`` (DBXML_CORPUS_WRITER)
    """
    for suffix in ('.data.dz', '.index'):
        if infile.endswith(suffix):
            return infile[:-len(suffix)] + '-updated', 'COMPACT_CORPUS_WRITER'
    if infile.endswith('.dact'):
        return infile[:-len('.dact')] + '-updated.dact', 'DBXML_CORPUS_WRITER'
    return infile + '-updated.dact', 'DBXML_CORPUS_WRITER'


def escape_attribute(value):
    """Escape *value* for use inside a double-quoted XML attribute."""
    # '&' must be replaced first so the entities introduced by the later
    # replacements are not escaped a second time.
    return (value.replace('&', '&amp;')
                 .replace('<', '&lt;')
                 .replace('>', '&gt;')
                 .replace('"', '&quot;'))


def stamp_root(name, xml):
    """Return *xml* with its first ``<alpino_ds ...>`` tag rewritten.

    The first opening root tag (including whatever attributes it had) is
    replaced by ``<alpino_ds doc_id="NAME" version="1.4">`` where NAME is
    the XML-escaped, UTF-8 encoded entry name.  *xml* is ``bytes`` and is
    returned unchanged when it contains no such tag.
    """
    tag = (b'<alpino_ds doc_id="'
           + escape_attribute(name).encode('utf-8')
           + b'" version="1.4">')
    # A callable replacement is inserted literally.  Passing ``tag`` itself
    # would make ``re`` interpret backslashes in the entry name as escape
    # sequences or group references (e.g. ``dir\1.xml`` raises re.error).
    return ROOT_TAG.sub(lambda match: tag, xml, count=1)


def update_corpus(infile):
    """Copy the corpus at *infile* to its ``-updated`` twin, stamping entries.

    Every ``(name, xml)`` pair yielded by the reader is written to the new
    corpus under its original, unescaped name; only the ``doc_id``
    attribute value embedded in the XML is escaped.
    """
    outfile, writer_type = output_target(infile)
    # NOTE(review): the meaning of the second Writer argument (True) is not
    # visible in this file -- confirm against the AlpinoCorpus bindings.
    with AlpinoCorpus.Reader(infile) as reader, \
            AlpinoCorpus.Writer(outfile, True, writer_type) as writer:
        for name, xml in reader.items():
            writer.write(name, stamp_root(name, xml))


def main(paths):
    """Update every corpus named in *paths*, in order."""
    for path in paths:
        update_corpus(path)


if __name__ == '__main__':
    main(sys.argv[1:])

                                      
