// This program reads every Alpino corpus named on the command line and, for
// each entry, prints the sentence followed by the roots and lemmas that differ
// from the surface word.
package main

import (
	"encoding/xml"
	"fmt"
	"os"
	"runtime"
	"strings"

	// "github.com/paulrosania/go-charset/charset"
	"github.com/rug-compling/alpinocorpus-go/alpinocorpus"
)

// main processes each command-line argument as a corpus file name.
func main() {
	for _, filename := range os.Args[1:] {
		doFile(filename)
	}
}

// doFile opens the corpus identified by filename and hands every key/value
// pair it contains to doXML. Any error is fatal (see checkErr).
func doFile(filename string) {
	reader, err := alpinocorpus.NewReader(filename)
	// Check the error before registering the deferred Close, so that Close is
	// never invoked on a reader that failed to open.
	checkErr(err)
	defer reader.Close()

	entries, err := reader.GetAll()
	checkErr(err)

	name := reader.Name()
	for pair := range entries.KeysValues() {
		doXML(name, pair.Key, &pair.Value)
	}
}

// doXML decodes one XML document and prints a summary of it to standard
// output.
//
// filename and dataname are only used to build the "FILE:" header line; data
// points to the XML text to decode. The output is the header, the content of
// the top-level <sentence> element, then all collected roots followed by all
// collected lemmas (space separated), and a blank line. A decoding error is
// fatal (see checkErr).
func doXML(filename, dataname string, data *string) {
	// Node mirrors a <node> element, which may contain further <node>
	// elements.
	type Node struct {
		Nodes []Node `xml:"node"`
		Root  string `xml:"root,attr"`
		Lemma string `xml:"lemma,attr"`
		Word  string `xml:"word,attr"`
	}
	// Nodes mirrors the document's top-level element.
	type Nodes struct {
		Nodes    []Node `xml:"node"`
		Sentence string `xml:"sentence"`
	}

	nodes := &Nodes{}
	p := xml.NewDecoder(strings.NewReader(*data))
	//p.CharsetReader = charset.NewReader // needed if xml is not in UTF-8
	checkErr(p.Decode(nodes))

	roots := make([]string, 0, 100)
	lemmas := make([]string, 0, 100)

	// walk visits a node and its descendants in document order (parent before
	// children), collecting roots and lemmas that add information beyond the
	// word itself.
	var walk func(node Node)
	walk = func(node Node) {
		// Keep the root only when it is present and differs from the word.
		if node.Root != "" && node.Root != node.Word {
			roots = append(roots, node.Root)
		}
		// Keep the lemma only when it differs from both the word and the root.
		if node.Lemma != "" && node.Lemma != node.Word && node.Lemma != node.Root {
			lemmas = append(lemmas, node.Lemma)
		}
		for _, child := range node.Nodes {
			walk(child)
		}
	}
	for _, n := range nodes.Nodes {
		walk(n)
	}

	fmt.Printf("FILE: %v/%v\n%v", filename, dataname, nodes.Sentence)
	for _, root := range roots {
		fmt.Print(" ", root)
	}
	for _, lemma := range lemmas {
		fmt.Print(" ", lemma)
	}
	fmt.Print("\n\n")
}

// checkErr does nothing when err is nil. Otherwise it reports the error on
// standard error, prefixed with the source file and line of the caller when
// that information is available, and then panics with err.
func checkErr(err error) {
	if err != nil {
		// Caller(1) identifies the call site of checkErr, not checkErr itself.
		_, filename, lineno, ok := runtime.Caller(1)
		if ok {
			fmt.Fprintf(os.Stderr, "\n\n%v:%v: Error: %v\n\n", filename, lineno, err)
		}
		panic(err)
	}
}