/* Zoek meerdere bestanden uit de treebank op in Sonar500 */ package main import ( "encoding/xml" "fmt" "github.com/pebbe/util" "io/ioutil" "labix.org/v2/mgo" "labix.org/v2/mgo/bson" "os" "path" "regexp" "strings" ) var ( servers = "127.0.0.1:27047" ) type Alpino struct { Sentence string `xml:"sentence"` } type Item struct { I string W string } func main() { if len(os.Args) == 1 { fmt.Printf(` Syntax: %s file.xml... bestandsnamen zijn ID van de zin met toevoeging .xml Uitvoer, van goed naar slecht: ID-T exacte overeenkomst van ID en tekst ID-SP exacte overeenkomst van ID; tekst verschilt alleen in spaties TEXT exacte overeenkomst van tekst SPACE tekst verschilt alleen in spaties AZ alleen overeenkomst van letters a t/m z, omgezet in kleine letters XX niet gevonden ERROR database fout `, os.Args[0]) return } noalpha := regexp.MustCompile("[^a-zA-Z]+") session, err := mgo.Dial(servers) util.CheckErr(err) defer session.Close() for _, filename := range os.Args[1:] { fmt.Print(filename) basename := path.Base(filename) idx := strings.LastIndex(basename, "-") if idx < 0 { fmt.Println(" XX") continue } basename = basename[:idx] id := strings.Replace(path.Base(filename), ".xml", "", 1) data, err := ioutil.ReadFile(filename) util.CheckErr(err) v := Alpino{} util.CheckErr(xml.Unmarshal(data, &v)) text := v.Sentence text = noalpha.ReplaceAllString(text, "") text = strings.ToLower(text) kort := text if len(kort) > 100 { kort = kort[:100] } nospace := strings.Replace(v.Sentence, " ", "", -1) collection := session.DB("sonar500").C(basename) idxs, _ := collection.Indexes() if len(idxs) < 3 { fmt.Println(" ERROR no database") continue } found := false query := collection.Find(bson.M{"i": id, "w": v.Sentence}) query = query.Select(bson.M{"i": true}) var item Item iter := query.Iter() for iter.Next(&item) { if !found { fmt.Print(" ID-T") found = true } fmt.Print(" ", item.I) } if !found { query := collection.Find(bson.M{"i": id, "s": text}) query = query.Select(bson.M{"i": true, "w": true}) var item Item iter := query.Iter() for iter.Next(&item) { if strings.Replace(item.W, " ", "", -1) == nospace { if !found { fmt.Print(" ID-SP") found = true } fmt.Print(" ", item.I) } } } if !found { query := collection.Find(bson.M{"k": kort, "w": v.Sentence}) query = query.Select(bson.M{"i": true}) var item Item iter := query.Iter() for iter.Next(&item) { if !found { fmt.Print(" TEXT") found = true } fmt.Print(" ", item.I) } } if !found { query := collection.Find(bson.M{"k": kort, "s": text}) query = query.Select(bson.M{"i": true, "w": true}) var item Item iter := query.Iter() for iter.Next(&item) { if strings.Replace(item.W, " ", "", -1) == nospace { if !found { fmt.Print(" SPACE") found = true } fmt.Print(" ", item.I) } } } if !found { query := collection.Find(bson.M{"k": kort, "s": text}) query = query.Select(bson.M{"i": true}) var item Item iter := query.Iter() for iter.Next(&item) { if !found { fmt.Print(" AZ") found = true } fmt.Print(" ", item.I) } } if !found { fmt.Print(" XX") } fmt.Println() } }