/*
Processes /net/corpora/DutchSemCor/dsc20120708-HUMAN.log

Adds the name of the source file, the ID of the sentence, and the sentence
itself to each line.

Syntax:

	./logmod input output

Typical use:

	nohup ./logmod /net/corpora/DutchSemCor/dsc20120708-HUMAN.log logmod.results.csv 2> logmod.results.log &

The first run takes a long time (well over half a day).

A later run (after changing the program, or adding source files) can use the
output of the first run as its input. That is much faster (minutes):

	./logmod logmod.results.csv logmod.results.csv2 2> logmod.results.log
*/
package main

import (
	"bytes"
	"compress/gzip"
	"encoding/csv"
	"encoding/xml"
	"fmt"
	"html"
	"io"
	"io/ioutil"
	"os"
	"sort"
	"strconv"
	"strings"

	"code.google.com/p/go-charset/charset"
	_ "code.google.com/p/go-charset/data"
	"github.com/pebbe/util"
)

var (
	prefix = "/net/corpora/LassyDevelop/Sonar/SONAR500/DATA"

	// fixit maps word forms with a mis-encoded apostrophe to their repaired forms.
	fixit = map[string]string{
		"administratie?":        "administratie’",
		"administratie\302\222": "administratie’",
		"drukke?":               "drukke’",
		"drukke\302\222":        "drukke’",
		"gordel?":               "gordel’",
		"gordel\302\222":        "gordel’",
		"programma?s":           "programma’s",
		"programma\302\222s":    "programma’s",
		"schema?s":              "schema’s",
		"schema\302\222s":       "schema’s",
		"studio?s":              "studio’s",
		"studio\302\222s":       "studio’s",
		"thema?s":               "thema’s",
		"thema\302\222s":        "thema’s",
		"vuile?":                "vuile’",
		"vuile\302\222":         "vuile’",
		"wijze?":                "wijze’",
		"wijze\302\222":         "wijze’",
	}
)

// Types for the FoLiA XML

type Lemma struct {
	Class string `xml:"class,attr"`
}

type Word struct {
	XMLName xml.Name `xml:"w"`
	ID      string   `xml:"id,attr"`
	T       string   `xml:"t"`
	Lemma   Lemma    `xml:"lemma"`
}

// Types for the CSV lines

type LineType struct {
	idx      int
	fields   []string
	filename string
	s_id     string
	sentence string
}

// LinesType sorts by token ID, so that all lines from the same source file are adjacent.
type LinesType []*LineType

func (s LinesType) Len() int      { return len(s) }
func (s LinesType) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s LinesType) Less(i, j int) bool {
	// Compare by ID
	return s[i].fields[1] < s[j].fields[1]
}

// LinesType2 sorts by the original line number, to restore the input order for output.
type LinesType2 []*LineType

func (s LinesType2) Len() int      { return len(s) }
func (s LinesType2) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s LinesType2) Less(i, j int) bool {
	// Compare by line number
	return s[i].idx < s[j].idx
}

func main() {

	if len(os.Args) != 3 {
		fmt.Printf(`
usage: %s infile outfile 2> logfile
`, os.Args[0])
		return
	}

	var headerline []string

	// Map from corpus label to directory name
	dirnames := make(map[string]string)
	fileinfos, err := ioutil.ReadDir(prefix)
	util.CheckErr(err)
	for _, fileinfo := range fileinfos {
		n := fileinfo.Name()
		parts := strings.Split(n, "_")
		if len(parts) > 1 {
			dirnames[parts[0]] = n
		}
	}

	// Read the CSV file and sort its lines by token ID
	lines := make([]*LineType, 0)
	fp, err := os.Open(os.Args[1])
	util.CheckErr(err)
	rd := csv.NewReader(fp)
	for lineno := 0; true; lineno++ {
		fields, err := rd.Read()
		if err == io.EOF {
			break
		}
		util.CheckErr(err)
		if fields[0] == "num" {
			// Header line: add the names of the three new columns, if not present yet.
			headerline = fields
			if len(headerline) < 15 {
				headerline = append(headerline, "filename", "sentence_id", "sentence")
			}
			continue
		}
		lines = append(lines, &LineType{idx: lineno, fields: fields})
	}
	fp.Close()
	sort.Sort(LinesType(lines))

	// Process all lines from the CSV file
	var words map[string]*Word
	var sentences map[string][]string
	rawlines := make([]string, 0)
	ok := false
	curfile := ""
	filetype := ""
	filename := ""
	nlines := len(lines)
	for lineno, line := range lines {
		fmt.Printf(" %d \r", nlines-lineno)
		fields := line.fields
		if len(fields) == 15 {
			if fields[14] != "" {
				// Already filled in by an earlier run: keep as is.
				continue
			}
			// Sentence still missing: strip the empty columns and try again.
			lines[lineno].fields = lines[lineno].fields[:12]
		}
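		// The switch below decides, per token ID (fields[1]) and source type
		// (fields[7]), which source file to open. Illustrative examples only;
		// the exact ID shapes are inferred from the comments and from the way
		// the IDs are taken apart further down:
		//
		//   SoNaR: WR-P-P-C-0000000140.p.1318.s.7.w.2
		//          -> <prefix>/<dir>/WR-P-P-C-0000000140.folia.xml
		//   CGN:   CGN-comp-a_fn000248.s.116.w.5
		//          -> /net/corpora/CGN_ANN_V2/data/annot/text/plk/comp-a/nl/fn000248.plk.gz
		//   Words: allwords-WR-P-P-L-0000000003.s.3.w.4 (assumed shape)
		//          -> /net/corpora/LassyDevelop/Suites/WR-P-P-L-0000000003.sents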
		// Continue with the same source file, or open a new one
		n := strings.Split(fields[1], ".")[0]
		if n != curfile {
			curfile = n
			filetype = fields[7]
			ok = false
			switch filetype {
			case "SoNaR":
				filename = prefix + "/" + dirnames[n[:strings.LastIndex(n, "-")]] + "/" + n + ".folia.xml"
				data, err := ioutil.ReadFile(filename)
				if err != nil {
					fmt.Fprintln(os.Stderr, err)
				} else {
					ok = true
					words = make(map[string]*Word)
					sentences = make(map[string][]string)
					parts := bytes.SplitAfter(data, []byte("</w>"))
					parts = parts[:len(parts)-1] // the part after the last </w> contains no word
					for _, p := range parts {
						// Unmarshal each <w> ... </w> element and index it by its ID.
						part := p[bytes.Index(p, []byte("<w")):]
						var word Word
						err := xml.Unmarshal(part, &word)
						util.CheckErr(err)
						words[word.ID] = &word
						// Collect the words of each sentence, keyed by sentence ID.
						sID := word.ID[:strings.Index(word.ID, ".w.")]
						sentences[sID] = append(sentences[sID], word.T)
					}
				}
			case "CGN":
				// e.g. CGN-comp-a_fn000248 -> /net/corpora/CGN_ANN_V2/data/annot/text/plk/comp-a/nl/fn000248.plk.gz
				s := strings.Replace(curfile, "CGN-", "", 1)
				p := strings.Split(s, "_")
				q := strings.SplitN(p[1], ".", 2)
				filename = "/net/corpora/CGN_ANN_V2/data/annot/text/plk/" + p[0] + "/nl/" + q[0] + ".plk.gz"
				fp, err = os.Open(filename)
				if err != nil {
					fmt.Fprintln(os.Stderr, err)
				} else {
					r1, err := gzip.NewReader(fp)
					util.CheckErr(err)
					r, err := charset.NewReader("latin1", r1)
					util.CheckErr(err)
					rd := util.NewLinesReaderFromReader(r)
					idx := ""
					sentences = make(map[string][]string)
					for line := range rd.ReadLines() {
						if strings.HasPrefix(line, "<au") {
							// An <au id="..."> line starts a new sentence; its id is
							// taken to be the sentence number that also appears in
							// the token IDs (.s.NNN.).
							if i := strings.Index(line, `id="`); i >= 0 {
								idx = line[i+4:]
								if j := strings.Index(idx, `"`); j >= 0 {
									idx = idx[:j]
								}
							}
						} else {
							// A token line: the first field is the word form.
							flds := strings.Fields(line)
							if len(flds) > 0 {
								sentences[idx] = append(sentences[idx], flds[0])
							}
						}
					}
					fp.Close()
					ok = true
				}
			case "Words":
				// e.g. allwords-WR-P-P-L-0000000003 -> /net/corpora/LassyDevelop/Suites/WR-P-P-L-0000000003.sents
				//      -> /net/corpora/LassyDevelop/Suites/dpc-bmm-001087-nl-sen.sents
				//      -> /net/corpora/LassyDevelop/Suites/wiki-138.sents
				s := strings.Replace(curfile, "allwords-", "", 1)
				p := strings.SplitN(s, ".", 2)[0]
				if strings.HasPrefix(p, "wiki") {
					p = "wiki-" + p[4:]
				}
				filename = "/net/corpora/LassyDevelop/Suites/" + p + ".sents"
				rd, err := util.NewLinesReaderFromFile(filename)
				if err != nil {
					fmt.Fprintln(os.Stderr, err)
				} else {
					rawlines = rawlines[0:0]
					for line := range rd.ReadLines() {
						rawlines = append(rawlines, line)
					}
					ok = true
				}
			case "Snippet":
			default:
				panic(fields[1] + ": unknown source type: " + fields[7])
			}
		}

		// No source file could be opened for this line
		if !ok {
			fmt.Fprintf(os.Stderr, "%s: file not found (%s / %s)\n", fields[1], fields[2], fields[7])
			continue
		}

		switch filetype {
		case "SoNaR":
			// Look the token up by its ID in the FoLiA data
			w, found := words[fields[1]]
			if found {
				/*
					if fields[1] == "WR-P-P-C-0000000140.p.1318.s.7.w.2" {
						fmt.Println(w.T)
						fmt.Println(fields[2])
						return
					}
				*/
				mismatch := false
				w1 := w.T
				w2 := fields[2]
				if ww, ok := fixit[w1]; ok {
					w1 = ww
				}
				if ww, ok := fixit[w2]; ok {
					w2 = ww
				}
				w1 = strings.ToLower(w1)
				w2 = strings.ToLower(html.UnescapeString(w2))
				if w1 != w2 {
					mismatch = true
					fmt.Fprintf(os.Stderr, "%s: T: %s != %s\n", fields[1], w2, w1)
				}
				if w.Lemma.Class != html.UnescapeString(fields[3]) {
					mismatch = true
					fmt.Fprintf(os.Stderr, "%s: Lemma: %s != %s\n", fields[1], html.UnescapeString(fields[3]), w.Lemma.Class)
				}
				if !mismatch {
					lines[lineno].filename = filename
					sID := fields[1][:strings.Index(fields[1], ".w.")]
					lines[lineno].sentence = strings.Join(sentences[sID], " ")
					lines[lineno].s_id = sID
				}
			} else {
				fmt.Fprintf(os.Stderr, "%s: word not found (%s / %s)\n", fields[1], fields[2], fields[7])
			}
		case "CGN":
			// Example line: 512553,CGN-comp-a_fn000248.s.116.w.5,voordoen,voordoen,,v,,CGN,r_v-9709,Elizabeth,N,2011-06-06 14:09:14
			i := strings.Split(fields[1], ".")
			nr, err := strconv.Atoi(i[4])
			util.CheckErr(err)
			found := false
			s, ok := sentences[i[2]]
			if ok {
				if nr <= len(s) {
					if strings.ToLower(s[nr-1]) == html.UnescapeString(fields[2]) {
						found = true
						lines[lineno].filename = filename
						lines[lineno].sentence = strings.Join(s, " ")
						lines[lineno].s_id = i[2]
					} else {
						fmt.Fprintf(os.Stderr, "%s: T: %s != %s\n", fields[1], fields[2], s[nr-1])
					}
				}
			}
			if !found {
				fmt.Fprintf(os.Stderr, "%s: word not found (%s / %s)\n", fields[1], fields[2], fields[7])
			}
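		// The "Words" case below assumes that line N of the .sents file holds
		// sentence N as "sentence-id|token token ...", and that the word
		// position in the token ID is 1-based. A hypothetical example of such
		// a line (not taken from a real file):
		//
		//   WR-P-P-L-0000000003.p.1.s.3|Dit is een voorbeeld .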
		case "Words":
			i := strings.Split(fields[1], ".")
			si, err := strconv.Atoi(i[2])
			util.CheckErr(err)
			wi, err := strconv.Atoi(i[4])
			util.CheckErr(err)
			found := false
			if si <= len(rawlines) {
				rawline := strings.SplitN(rawlines[si-1], "|", 2)
				id := rawline[0]
				wrds := strings.Fields(rawline[1])
				if wi <= len(wrds) {
					if wrds[wi-1] == html.UnescapeString(fields[2]) {
						found = true
						lines[lineno].filename = filename
						lines[lineno].s_id = id
						lines[lineno].sentence = rawline[1]
					} else {
						fmt.Fprintf(os.Stderr, "%s: T: %s != %s\n", fields[1], fields[2], wrds[wi-1])
					}
				}
			}
			if !found {
				fmt.Fprintf(os.Stderr, "%s: word not found (%s / %s)\n", fields[1], fields[2], fields[7])
			}
		case "Snippet":
			fmt.Fprintf(os.Stderr, "%s: word not found (%s / %s)\n", fields[1], fields[2], fields[7])
		default:
			panic(fields[1] + ": unknown source type: " + fields[7])
		}
	}
	fmt.Print("          \r")

	// Restore the original line order and write the output CSV
	sort.Sort(LinesType2(lines))
	fp, err = os.Create(os.Args[2])
	util.CheckErr(err)
	defer fp.Close()
	w := csv.NewWriter(fp)
	defer w.Flush()
	if len(headerline) > 0 {
		util.CheckErr(w.Write(headerline))
	}
	for _, line := range lines {
		if len(line.fields) < 15 {
			line.fields = append(line.fields, line.filename, line.s_id, line.sentence)
		}
		util.CheckErr(w.Write(line.fields))
	}
}
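// For a CSV line such as the example quoted in the CGN case above,
//
//	512553,CGN-comp-a_fn000248.s.116.w.5,voordoen,voordoen,,v,,CGN,r_v-9709,Elizabeth,N,2011-06-06 14:09:14
//
// a successful lookup fills the three extra output columns roughly as follows
// (values derived from the code above, not from an actual run):
//
//	filename:    /net/corpora/CGN_ANN_V2/data/annot/text/plk/comp-a/nl/fn000248.plk.gz
//	sentence_id: 116
//	sentence:    the tokens of sentence 116, joined with spaces
//
// Lines for which no match was found keep empty values in these columns, so a
// later run can try them again.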