#!/usr/bin/env python3.1

import io, sys
import xml.etree.ElementTree as ET
import AlpinoCorpus

def doXML(data, archive, name):
    """Write one text record for a single parsed XML document to stdout.

    The record has the form::

        FILE: <archive>/<name>
        <sentence> <root> ... <lemma> ...
        <blank line>

    where ``<sentence>`` is the text of the ``sentence`` child of the
    document root, followed by every ``root`` attribute that differs from
    the ``word`` attribute of the same ``node`` element, followed by every
    ``lemma`` attribute that differs from both ``root`` and ``word``.

    Parameters:
        data    -- the XML document, in any form ``ET.fromstring`` accepts
        archive -- label printed before the slash on the ``FILE:`` line
        name    -- label printed after the slash on the ``FILE:`` line

    Any exception raised by ``ET.fromstring`` for malformed input is
    propagated to the caller.
    """
    tree = ET.fromstring(data)

    # findtext() yields None when there is no <sentence> element; fall back
    # to an empty string so the literal text "None" never ends up in the
    # output.  (Default passed positionally: works on every 3.x release.)
    sentence = tree.findtext('./sentence', '')

    roots = []
    lemmas = []
    for node in tree.findall('.//node'):
        attrs = node.attrib
        root = attrs.get('root', '')
        word = attrs.get('word', '')
        lemma = attrs.get('lemma', '')
        # Only keep forms that add something beyond the surface word.
        if root and root != word:
            roots.append(root)
        if lemma and lemma != root and lemma != word:
            lemmas.append(lemma)

    # Sentence first, then all roots, then all lemmas, space-separated.
    text = ' '.join([sentence] + roots + lemmas)

    sys.stdout.write('FILE: {0}/{1}\n{2}\n\n'.format(archive, name, text))

# Re-wrap the underlying stdout buffer so that everything written below is
# encoded as UTF-8, independent of the default encoding.
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')

# No corpus given on the command line: print a usage hint and stop.
if len(sys.argv) == 1:
    usage = '\nUsage: {} filename(s)\n\n'.format(sys.argv[0])
    sys.stderr.write(usage)
    raise SystemExit

# Process each corpus in command-line order, handing every (name, data)
# pair it yields to doXML together with the corpus's own name.
for corpus_path in sys.argv[1:]:
    with AlpinoCorpus.Reader(corpus_path) as reader:
        corpus_label = reader.name()
        for entry_name, entry_data in reader.items():
            doXML(entry_data, corpus_label, entry_name)
