/* */ package main /* #cgo CFLAGS: -I./tokenizer #cgo LDFLAGS: -L./tokenizer -ltokenizer #include #include #include */ import "C" import ( "bufio" "compress/gzip" "encoding/json" "fmt" "github.com/pebbe/libtextcat" "html" "io" "os" "os/exec" "runtime" "strings" "unsafe" ) var ( servers = "127.0.0.1:27017" data = make([]byte, 0, 100000) outfiles = make(map[string]*os.File) ) type stTweetScan struct { Id_str string Created_at string Text string User stUserScan } type stUserScan struct { Lang string } func main() { setLocale() /* required for tokenizer */ tc, e := libtextcat.NewTextcat("/net/aistaff/kleiweg/Twitter/textcat-conf-test.txt") checkErr(e) for uur := 0; uur < 24; uur++ { filename := fmt.Sprintf("/net/corpora/twitter/000RAW/201209/20120905:%02d.out.gz", uur) _, e := os.Stat(filename) if e != nil { continue } f, err := os.Open(filename) checkErr(err) r, err := gzip.NewReader(f) checkErr(err) rd := bufio.NewReaderSize(r, 100000) lineno := 0 for { eof := getline(rd) if eof { r.Close() f.Close() break } lineno += 1 tweetScan := stTweetScan{} err := json.Unmarshal(data, &tweetScan) if err != nil { // fmt.Fprintf(os.Stderr, "%v:%v: %v: %v\n", file.name, lineno, err, string(data)) continue } if tweetScan.Id_str == "" { continue } // Sla niet-Nederlandse tweets over. // Gebruikers die Nederlandse tweets posten hebben gewoonlijk hun // taalvoorkeur op "nl" gezet, of niet ingesteld, en dan is het "en". if tweetScan.User.Lang[:2] != "nl" && tweetScan.User.Lang[:2] != "en" { continue } // sanitise tweetScan.Text = strings.Join(strings.Fields(html.UnescapeString(tweetScan.Text)), " ") // tokenizer text := tweetScan.Text cs := C.CString(text) text2 := C.GoString(C.tokenize(cs)) C.free(unsafe.Pointer(cs)) if C.tok_error != C.TOK_OK { fmt.Fprintf(os.Stderr, "%v:%v: %v: %v\n", filename, lineno, text2, text) text2 = text } // taalrader words := strings.Fields(text2) w := []string{} for _, ww := range words { if !(ww == "RT" || strings.HasPrefix(ww, "@") || strings.HasPrefix(ww, "#") || strings.HasPrefix(ww, "http:") || strings.HasPrefix(ww, "https:")) { w = append(w, ww) } } language := tc.Classify(strings.Join(w, " ")) lang1 := strings.SplitAfter(language, "]")[0] if _, ok := outfiles[lang1]; !ok { outfiles[lang1], err = os.Create("test-" + lang1 + ".out") checkErr(err) } _, err = outfiles[lang1].WriteString(language + "\t" + text + "\n") checkErr(err) } // range line in file } // range uur in dag for outfile := range outfiles { checkErr(outfiles[outfile].Close()) } } func getline(r *bufio.Reader) (eof bool) { data = data[0:0] for { line, isP, err := r.ReadLine() if err == io.EOF { eof = true break } checkErr(err) data = append(data, line...) if !isP { break } } return } func checkErr(err error) { if err != nil { _, filename, lineno, ok := runtime.Caller(1) if ok { fmt.Fprintf(os.Stderr, "%v:%v: %v\n", filename, lineno, err) } panic(err) } } func setLocale() { loc := "" out, err := exec.Command("locale", "locale", "-a").Output() if err != nil { panic(err) } ll := strings.Fields(string(out)) for _, l := range ll { if strings.HasSuffix(strings.ToLower(l), ".utf8") { loc = l break } } if loc == "" { panic("command 'locale -a' returned nothing with utf8") } cs := C.CString(loc) C.setlocale(C.LC_CTYPE, cs) C.free(unsafe.Pointer(cs)) }