/* */ package main import ( "bufio" "compress/gzip" "encoding/json" "fmt" "github.com/pebbe/textcat" "html" "io" "os" "regexp" "runtime" "strings" ) var ( servers = "127.0.0.1:27017" data = make([]byte, 0, 100000) outfiles = make(map[string]*os.File) reWord = regexp.MustCompile("(" + // url "[hH][tT][tT][pP][sS]?:([-A-Za-z0-9\\._~:/?#\\[\\]@!$&'\\(\\)\\*\\+,;=]|%[0-9a-fA-f][0-9a-fA-f])*" + "|" + // hashtag "#[\\p{L}0-9]+" + "|" + // mention "@[a-zA-Z0-9_]+" + "|" + // word "\\p{L}-?\\p{L}+(-\\p{L}+)*" + ")") ) type stTweetScan struct { Id_str string Created_at string Text string User stUserScan } type stUserScan struct { Lang string } func main() { tc1 := textcat.NewTextCat() tc1.EnableAllUtf8Languages() tc1.DisableLanguages("af.utf8", "fy.utf8") tc2 := textcat.NewTextCat() tc2.EnableAllUtf8Languages() tc2.DisableLanguages("af.utf8", "fy.utf8") tc2.SetRoot() for uur := 0; uur < 24; uur++ { filename := fmt.Sprintf("/net/corpora/twitter/000RAW/201209/20120905:%02d.out.gz", uur) _, e := os.Stat(filename) if e != nil { continue } f, err := os.Open(filename) checkErr(err) r, err := gzip.NewReader(f) checkErr(err) rd := bufio.NewReaderSize(r, 100000) lineno := 0 for { eof := getline(rd) if eof { r.Close() f.Close() break } lineno += 1 tweetScan := stTweetScan{} err := json.Unmarshal(data, &tweetScan) if err != nil { // fmt.Fprintf(os.Stderr, "%v:%v: %v: %v\n", file.name, lineno, err, string(data)) continue } if tweetScan.Id_str == "" { continue } // Sla niet-Nederlandse tweets over. // Gebruikers die Nederlandse tweets posten hebben gewoonlijk hun // taalvoorkeur op "nl" gezet, of niet ingesteld, en dan is het "en". if tweetScan.User.Lang[:2] != "nl" && tweetScan.User.Lang[:2] != "en" { continue } // sanitise tweetScan.Text = strings.Join(strings.Fields(html.UnescapeString(tweetScan.Text)), " ") // tokenizer text := tweetScan.Text words := strings.Fields(reWord.ReplaceAllString(tweetScan.Text, " $1 ")) // taalrader w := []string{} for _, ww := range words { if !(ww == "RT" || strings.HasPrefix(ww, "@") || strings.HasPrefix(ww, "#") || strings.HasPrefix(ww, "http:") || strings.HasPrefix(ww, "https:")) { w = append(w, ww) } } t := strings.Join(w, " ") lang, err := tc1.Classify(t) language1 := "" if err != nil { language1 = err.Error() } else if len(lang) == 1 { language1 = lang[0] } else { language1 = "MULTI" } lang, err = tc2.Classify(t) language2 := "" if err != nil { language2 = err.Error() } else if len(lang) == 1 { language2 = lang[0] } else { language2 = "MULTI" } if language1 == language2 { continue } lang1 := language1 + "-1" if _, ok := outfiles[lang1]; !ok { outfiles[lang1], err = os.Create("test-" + lang1 + ".out") checkErr(err) } _, err = outfiles[lang1].WriteString(language1 + "\t" + text + "\n") checkErr(err) lang2 := language2 + "-2" if _, ok := outfiles[lang2]; !ok { outfiles[lang2], err = os.Create("test-" + lang2 + ".out") checkErr(err) } _, err = outfiles[lang2].WriteString(language2 + "\t" + text + "\n") checkErr(err) } // range line in file } // range uur in dag for outfile := range outfiles { checkErr(outfiles[outfile].Close()) } } func getline(r *bufio.Reader) (eof bool) { data = data[0:0] for { line, isP, err := r.ReadLine() if err == io.EOF { eof = true break } checkErr(err) data = append(data, line...) if !isP { break } } return } func checkErr(err error) { if err != nil { _, filename, lineno, ok := runtime.Caller(1) if ok { fmt.Fprintf(os.Stderr, "%v:%v: %v\n", filename, lineno, err) } panic(err) } }