/* */ package main import ( "encoding/csv" "fmt" "github.com/pebbe/util" "io/ioutil" "os" "sort" "strings" ) var ( prefix = "/net/corpora/dutchsemcor-folia" ) // Types voor CSV-regels type LineType struct { id string } type LinesType []*LineType func (s LinesType) Len() int { return len(s) } func (s LinesType) Swap(i, j int) { s[i], s[j] = s[j], s[i] } func (s LinesType) Less(i, j int) bool { // Vergelijk op ID return s[i].id < s[j].id } func main() { if len(os.Args) != 2 { fmt.Printf(` usage: %s infile > logfile `, os.Args[0]) return } // Mapping van label naar directory dirnames := make(map[string]string) fileinfos, err := ioutil.ReadDir(prefix) util.CheckErr(err) for _, fileinfo := range fileinfos { n := fileinfo.Name() if n[0] == 'W' { parts := strings.Split(n, "_") if len(parts) > 1 { dirnames[parts[0]] = n } } } // CSV-bestand inlezen en sorteren lines := make([]*LineType, 0) fp, err := os.Open(os.Args[1]) util.CheckErr(err) rd := csv.NewReader(fp) for lineno := 0; true; lineno++ { fields, err := rd.Read() if err != nil && err.Error() == "EOF" { break } util.CheckErr(err) if fields[0] == "num" { continue } lines = append(lines, &LineType{id: fields[1]}) } fp.Close() sort.Sort(LinesType(lines)) // Alle regels uit CSV-bestand testen curfile := "" filename := "" for _, line := range lines { // Verder met zelfde bestand, of een nieuwe openen n := strings.Split(line.id, ".")[0] if n != curfile { curfile = n // Let op: er zijn regels waarvoor curfile begint met CGN en toch fields[7] == SoNaR if curfile[0] == 'W' { filename = "/net/corpora/dutchsemcor-folia/" + dirnames[n[:strings.LastIndex(n, "-")]] + "/" + n + ".folia.xml" } else if strings.HasPrefix(curfile, "CGN") { a := strings.SplitN(n, "_", 2) filename = "/net/corpora/dutchsemcor-folia/" + a[0] + "/" + a[1] + ".folia.xml" } else { filename = "/net/corpora/dutchsemcor-folia/" + n } if _, e := os.Stat(filename); e != nil { fmt.Println(curfile) } } } }