/*
Compare /net/corpora/DutchSemCor/dsc20120708-HUMAN.log with the files in
/net/corpora/LassyDevelop/Sonar/SONAR500/DATA to check whether the IDs match.

Errors and differences are saved in SonarCheck.errors.txt
*/
package main

import (
	"bytes"
	"encoding/csv"
	"encoding/xml"
	"fmt"
	"github.com/pebbe/util"
	"html"
	"io/ioutil"
	"os"
	"sort"
	"strings"
)

var (
	// prefix is the directory whose entries are listed to build the
	// label-to-directory mapping and under which the XML files are looked up.
	prefix = "/net/corpora/LassyDevelop/Sonar/SONAR500/DATA"
	// logfile is the CSV log that is read, sorted and checked.
	logfile = "/net/corpora/DutchSemCor/dsc20120708-HUMAN.log"
)

// Types for XML

// Lemma holds the "class" attribute of a <lemma> element.
type Lemma struct {
	Class string `xml:"class,attr"`
}

// Word is the decoded form of a <w> element: its "id" attribute, the content
// of its <t> child and its <lemma> child.
type Word struct {
	XMLName xml.Name `xml:"w"`
	ID      string   `xml:"id,attr"`
	T       string   `xml:"t"`
	Lemma   Lemma    `xml:"lemma"`
}

// Types for CSV lines

// LineType wraps the fields of a single CSV record.
type LineType struct {
	fields []string
}

// LinesType is a slice of CSV records that provides the Len, Swap and Less
// methods needed by sort.Sort.
type LinesType []*LineType

// Len returns the number of records.
func (s LinesType) Len() int {
	return len(s)
}

// Swap exchanges the records at positions i and j.
func (s LinesType) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

// Less reports whether record i sorts before record j, using a plain string
// comparison of the second field (the ID).
// NOTE(review): a record with fewer than two fields makes this index out of
// range; nothing in the visible code checks the field count.
func (s LinesType) Less(i, j int) bool {
	// Compare on ID
	return s[i].fields[1] < s[j].fields[1]
}

// main builds a label-to-directory mapping from the entries under prefix,
// reads the CSV log into memory, sorts it by ID, creates the error report
// file, and then walks the sorted records one by one.
func main() {

	// Mapping from label to directory: the part of an entry name before the
	// first "_" maps to the full entry name. Names without "_" are ignored.
	dirnames := make(map[string]string)
	fileinfos, err := ioutil.ReadDir(prefix)
	util.CheckErr(err) // presumably aborts the program on a non-nil error — confirm in pebbe/util
	for _, fileinfo := range fileinfos {
		n := fileinfo.Name()
		parts := strings.Split(n, "_")
		if len(parts) > 1 {
			dirnames[parts[0]] = n
		}
	}

	// Read the CSV file and sort it
	lines := make([]*LineType, 0)
	fp, err := os.Open(logfile)
	util.CheckErr(err)
	rd := csv.NewReader(fp)
	for {
		fields, err := rd.Read()
		// End of input is detected by the error's message text.
		// NOTE(review): comparing against io.EOF would be the sturdier check.
		if err != nil && err.Error() == "EOF" {
			break
		}
		util.CheckErr(err)
		// Skip records whose first field is "num" (presumably the header row — confirm).
		if fields[0] == "num" {
			continue
		}
		lines = append(lines, &LineType{fields})
	}
	fp.Close()
	sort.Sort(LinesType(lines))

	// O_EXCL makes this fail when SonarCheck.errors.txt already exists, so an
	// earlier report is never overwritten.
	fperr, err := os.OpenFile("SonarCheck.errors.txt", os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0644)
	util.CheckErr(err)
	defer fperr.Close()

	// Check all lines from the CSV file against the XML files
	var words map[string]*Word
	ok := false
	curfile := ""
	nlines := len(lines)
	for lineno, line := range lines {
		// Progress counter: number of records still to go, followed by a
		// carriage return and no newline.
		fmt.Printf(" %d \r", nlines-lineno)
		fields := line.fields

		// Verder met zelfde
XML-bestand, of een nieuwe openen n := strings.Split(fields[1], ".")[0] n = prefix + "/" + dirnames[n[:strings.LastIndex(n, "-")]] + "/" + n + ".folia.xml" if n != curfile { curfile = n data, err := ioutil.ReadFile(curfile) if err != nil { ok = false fmt.Fprintln(fperr, err) } else { ok = true words = make(map[string]*Word) parts := bytes.SplitAfter(data, []byte("")) parts = parts[:len(parts)-1] // stuk na laatste bevat geen woord for _, p := range parts { part := p[bytes.Index(p, []byte("