/* Voor alle bestanden in /net/corpora/LassyDevelop/Treebank/ : Voeg betekenis uit bestand /net/corpora/DutchSemCor/dsc20120708-HUMAN.log toe als attribuut cornetto2 Resultaten opslaan in: /net/corpora/Sonar500/Treebank/ TODO: alleen atribuut toevoegen als het twee keer, identiek voorkomt ? */ package main import ( "encoding/csv" "fmt" "github.com/pebbe/util" "html" "io/ioutil" "os" "path" "sort" "strconv" "strings" ) var ( prefix = "/net/corpora/LassyDevelop/Treebank" postfix = "/net/corpora/Sonar500/Treebank" //logfile = "/net/corpora/DutchSemCor/dsc20120708-HUMAN.log" logfile = "logmod.results.csv" curfile = "" openfile = false changed = false items []string ) // Types voor CSV-regels type LineType struct { fields []string } type LinesType []*LineType func (s LinesType) Len() int { return len(s) } func (s LinesType) Swap(i, j int) { s[i], s[j] = s[j], s[i] } func (s LinesType) Less(i, j int) bool { // Vergelijk op ID return s[i].fields[1] < s[j].fields[1] } func main() { // logfile voor fouten fperr, err := os.OpenFile("LassyCornetto.errors.txt", os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0644) util.CheckErr(err) defer fperr.Close() // CSV-bestand inlezen en sorteren lines := make([]*LineType, 0) fp, err := os.Open(logfile) util.CheckErr(err) rd := csv.NewReader(fp) for { fields, err := rd.Read() if err != nil && err.Error() == "EOF" { break } util.CheckErr(err) if fields[0] == "num" { continue } lines = append(lines, &LineType{fields}) } fp.Close() sort.Sort(LinesType(lines)) // Alle regels uit CSV-bestand toepassen op XML-bestanden nlines := len(lines) for lineno, line := range lines { fmt.Printf(" %d \r", nlines-lineno) // Verder met zelfde XML-bestand, of een nieuwe openen i := strings.Index(line.fields[1], ".w") n := line.fields[1][:i] + ".xml" n = n[:strings.Index(n, ".")] + "/" + n n2 := line.fields[13] + ".xml" n2 = n2[:strings.Index(n2, ".")] + "/" + n2 if n != curfile && n2 != curfile { closefile() curfile = n data, err := ioutil.ReadFile(postfix + "/" + curfile) if err != nil { data, err = ioutil.ReadFile(prefix + "/" + curfile) } if err != nil && n2 != "/.xml" { curfile = n2 data, err = ioutil.ReadFile(postfix + "/" + curfile) if err != nil { data, err = ioutil.ReadFile(prefix + "/" + curfile) } } if err == nil { openfile = true items = strings.SplitAfter(string(data), ">") } } // Geen XML-bestand voor deze regel geopend if !openfile { continue } // attribuut toevoegen id := line.fields[1] pos, err := strconv.Atoi(id[strings.LastIndex(id, ".")+1:]) util.CheckErr(err) begin := fmt.Sprintf("begin=\"%d\"", pos-1) end := fmt.Sprintf("end=\"%d\"", pos) found := false for i, item := range items { if strings.Index(item, begin) > 0 && strings.Index(item, end) > 0 && strings.Index(item, "word=\"") > 0 { idx := strings.Index(item, " word=\"") wrd := item[idx+7:] wrd = wrd[:strings.Index(wrd, "\"")] wrd = html.UnescapeString(wrd) if strings.ToLower(wrd) == strings.ToLower(html.UnescapeString(line.fields[2])) { found = true // oude cornetto2 verwijderen idx := strings.Index(item, " cornetto2=\"") if idx > 0 { right := item[idx+12:] idx2 := strings.Index(right, "\"") item = item[:idx] + right[idx2+1:] } items[i] = item[:len(item)-2] + " cornetto2=\"" + line.fields[8] + "\"/>" changed = true } if !found { fmt.Fprintf(fperr, "%s %s != %s\n", id, wrd, html.UnescapeString(line.fields[2])) } break } } if !found { fmt.Fprintf(fperr, "%s not found: %s\n", id, html.UnescapeString(line.fields[2])) } } fmt.Print(" \r") closefile() } func closefile() { if !openfile { changed = false return } if changed { changed = false fullname := postfix + "/" + curfile util.CheckErr(os.MkdirAll(path.Dir(fullname), 0755)) fp, err := os.Create(fullname) util.CheckErr(err) defer fp.Close() fmt.Fprint(fp, strings.Join(items, "")) } openfile = false }