package main

import (
	"bufio"
	"bytes"
	"flag"
	"fmt"
	"io"
	"math"
	"os"
	"strconv"

	"github.com/pebbe/util"
)

type tokenType int

type token struct {
	t tokenType
	s string
	i int64
	f float64
}

const (
	tokenUndefined tokenType = iota // to catch errors, tokenUndefined must be first
	tokenBlockBegin
	tokenBlockEnd
	tokenLineBegin
	tokenLineEnd
	tokenLineError
	tokenFeature
	tokenFileEnd
	tokenString
	tokenInt
	tokenFloat
)

var (
	features = make(map[string]float64)
	opt_d    = flag.Bool("d", false, "display continuation messages to standard output")
	opt_r    = flag.Bool("r", false, "if set, only the raw number of overlap/correct/system triples per sentence is reported")
	opt_u    = flag.Float64("u", 0.0, "weight of unknown features")
	opt_w    = flag.String("w", "data.weights", "file containing features and their weights")
	opt_x    = flag.Bool("x", false, "don't read weights from file")
)

func main() {
	var filename, inputfile string

	// Prefixes: f* = first parse of each sentence, b* = best-scoring parse,
	// c* = parse selected by the feature weights ("test"); the lb*/lc*
	// variants hold the running best/test values for the current sentence,
	// and i* the values just read for one candidate parse.
	var bCorrect, bN, bOverlap, bPen, bSystem int64
	var cCorrect, cN, cOverlap, cPen, cSystem int64
	var counter, exact int64
	var fCorrect, fN, fOverlap, fPen, fSystem int64
	var iCorrect, iOverlap, iSystem int64
	var lbCorrect, lbN, lbOverlap, lbPen, lbSystem int64
	var lcCorrect, lcN, lcOverlap, lcPen, lcSystem int64
	var thisN, thisPen int64
	var bAv, bAverage, bFscore, bPrecision, bRecall, bScore float64
	var cAv, cAverage, cFscore, cPrecision, cRecall, cScore float64
	var fAv, fAverage, fFscore, fPrecision, fRecall, fScore float64
	var kappa, kappaAv float64
	var lbScore, lcScore, lcWeight float64
	var thisScore, thisWeight float64

	flag.Parse()

	if flag.NArg() == 0 && !util.IsTerminal(os.Stdin) {
		inputfile = ""
	} else if flag.NArg() == 1 {
		inputfile = flag.Arg(0)
	} else {
		fmt.Fprintf(os.Stderr, "\nUsage: %s [args] [filename]\n\nargs with default values are:\n\n", os.Args[0])
		flag.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\nif filename is missing, read from stdin\n\n")
		return
	}

	if !*opt_x {
		getWeights(*opt_w)
	}

	ch := make(chan token, 100)
	go lexer(inputfile, ch)

	newSentence := false

MainLoop:
	for {
		it := <-ch
		switch it.t {
		default:
			fmt.Fprintf(os.Stderr, "\nUnexpected token from lexer: %#v\n\n", it)
			os.Exit(1)
		case tokenFileEnd:
			break MainLoop
		case tokenBlockBegin:
			if *opt_d || *opt_r {
				filename = (<-ch).s
			}
			counter++
			newSentence = true
		case tokenBlockEnd:
			// End of a sentence: fold the per-sentence values into the totals.
			cPen += lcPen
			cN += lcN
			cAv += score(lcPen, lcN)
			bPen += lbPen
			bN += lbN
			bAv += score(lbPen, lbN)
			if score(lcPen, lcN) >= score(lbPen, lbN) {
				exact++
			}
			bOverlap += lbOverlap
			bCorrect += lbCorrect
			bSystem += lbSystem
			cOverlap += lcOverlap
			cCorrect += lcCorrect
			cSystem += lcSystem
			if *opt_d {
				cScore = score(cPen, cN)
				fmt.Fprintf(os.Stderr, "\t%s test-score\t%.2f\t(%.2f) (exact: %.2f)\n", filename, lcScore, cScore, float64(exact)/float64(counter))
			}
			if *opt_r {
				fmt.Printf("%v\t%v\t%v\n", filename, lcPen, lcN)
			}
		case tokenLineBegin:
			iOverlap = (<-ch).i
			iCorrect = (<-ch).i
			iSystem = (<-ch).i
			thisWeight = (<-ch).f
			thisPen = pen(iOverlap, iCorrect, iSystem)
			thisN = max(iCorrect, iSystem)
			thisScore = score(thisPen, thisN)
			if !newSentence {
				if thisWeight > lcWeight {
					lcWeight = thisWeight
					lcPen = thisPen
					lcN = thisN
					lcScore = thisScore
					lcOverlap = iOverlap
					lcCorrect = iCorrect
					lcSystem = iSystem
				}
				if thisScore > lbScore {
					lbScore = thisScore
					lbPen = thisPen
					lbN = thisN
					lbOverlap = iOverlap
					lbCorrect = iCorrect
					lbSystem = iSystem
				}
			} else {
				// First parse of a new sentence: count it as the "first" parse
				// and initialise both the best and the test trackers.
				newSentence = false
				fPen += thisPen
				fN += thisN
				fAv += thisScore
				fOverlap += iOverlap
				fCorrect += iCorrect
				fSystem += iSystem
				lcWeight = thisWeight
				lcN = thisN
				lcPen = thisPen
				lcScore = thisScore
				lbN = thisN
				lbPen = thisPen
				lbScore = thisScore
				lcOverlap = iOverlap
				lcCorrect = iCorrect
				lcSystem = iSystem
				lbOverlap = iOverlap
				lbCorrect = iCorrect
				lbSystem = iSystem
			}
		}
	}

	fScore = score(fPen, fN)
	bScore = score(bPen, bN)
	cScore = score(cPen, cN)
	if bScore != fScore {
		kappa = 100.0 * (cScore - fScore) / (bScore - fScore)
	} else {
		kappa = math.NaN()
	}

	cAverage = cAv / float64(counter)
	bAverage = bAv / float64(counter)
	fAverage = fAv / float64(counter)
	if bAverage != fAverage {
		kappaAv = 100.0 * (cAverage - fAverage) / (bAverage - fAverage)
	} else {
		kappaAv = math.NaN()
	}

	if !*opt_r {
		fmt.Printf("\n")
		fmt.Printf("exact %.2f\n", float64(exact)/float64(counter))
		fmt.Printf("first-score %.2f %.2f\n", fScore, fAverage)
		fmt.Printf("best-score %.2f %.2f\n", bScore, bAverage)
		fmt.Printf("test-score %.2f %.2f\n", cScore, cAverage)
		fmt.Printf("phi-score %.2f %.2f\n", kappa, kappaAv)
		fmt.Printf("first-p/m: %v %v\n", fPen, fN)
		fmt.Printf("best-p/m: %v %v\n", bPen, bN)
		fmt.Printf("test-p/m: %v %v\n", cPen, cN)
		fmt.Printf("first-av: %v %v\n", fAv, counter)
		fmt.Printf("best-av: %v %v\n", bAv, counter)
		fmt.Printf("test-av: %v %v\n", cAv, counter)
		fmt.Printf("first-overlap %v\n", fOverlap)
		fmt.Printf("first-correct %v\n", fCorrect)
		fmt.Printf("first-system %v\n", fSystem)
		fmt.Printf("best-overlap %v\n", bOverlap)
		fmt.Printf("best-correct %v\n", bCorrect)
		fmt.Printf("best-system %v\n", bSystem)
		fmt.Printf("test-overlap %v\n", cOverlap)
		fmt.Printf("test-correct %v\n", cCorrect)
		fmt.Printf("test-system %v\n", cSystem)
		fPrecision = 100.0 * float64(fOverlap) / float64(fSystem)
		fRecall = 100.0 * float64(fOverlap) / float64(fCorrect)
		fFscore = (2.0 * fPrecision * fRecall) / (fPrecision + fRecall)
		fmt.Printf("first-precision %.2f\n", fPrecision)
		fmt.Printf("first-recall %.2f\n", fRecall)
		fmt.Printf("first-fscore %.2f\n", fFscore)
		bPrecision = 100.0 * float64(bOverlap) / float64(bSystem)
		bRecall = 100.0 * float64(bOverlap) / float64(bCorrect)
		bFscore = (2 * bPrecision * bRecall) / (bPrecision + bRecall)
		fmt.Printf("best-precision %.2f\n", bPrecision)
		fmt.Printf("best-recall %.2f\n", bRecall)
		fmt.Printf("best-fscore %.2f\n", bFscore)
		cPrecision = 100.0 * float64(cOverlap) / float64(cSystem)
		cRecall = 100.0 * float64(cOverlap) / float64(cCorrect)
		cFscore = (2.0 * cPrecision * cRecall) / (cPrecision + cRecall)
		fmt.Printf("test-precision %.2f\n", cPrecision)
		fmt.Printf("test-recall %.2f\n", cRecall)
		fmt.Printf("test-fscore %.2f\n", cFscore)
	}
}
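// pen and score (below) implement the per-parse penalty and accuracy used in main:
// pen(ov, corr, sys) = max(corr, sys) - ov and score(pen, n) = 100 * (1 - pen/n),
// where score is defined as 100 when n is zero.
//
// Worked example with made-up numbers (illustration only, not taken from any data):
// ov=3, corr=5, sys=4 gives pen = max(5,4) - 3 = 2, n = max(5,4) = 5,
// and score = 100 * (1 - 2/5) = 60.00.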
func max(i, j int64) int64 {
	if i > j {
		return i
	}
	return j
}

func pen(ov, corr, sys int64) int64 {
	return max(corr, sys) - ov
}

func score(pen, n int64) float64 {
	if n != 0 {
		return 100.0 * (1.0 - float64(pen)/float64(n))
	}
	return 100.0
}

/*
output optimized lexer:

	(
		tokenBlockBegin
		[ tokenString ]   // only if opt_d or opt_r: filename
		(
			tokenLineBegin
			tokenInt
			tokenInt
			tokenInt
			tokenFloat    // processed features
		) *
		tokenBlockEnd
	) *
	tokenFileEnd

output simple lexer (not used):

	(
		tokenBlockBegin
		tokenString       // filename
		(
			tokenLineBegin
			tokenInt
			tokenInt
			tokenInt
			(
				tokenFeature
				tokenFloat
				tokenString
			) *
			tokenLineEnd
		) *
		tokenBlockEnd
	) *
	tokenFileEnd

output lexer with error detection (not used):

	(
		tokenBlockBegin
		tokenString       // filename
		(
			tokenLineBegin
			tokenInt
			tokenInt
			tokenInt
			(
				tokenFeature
				tokenFloat
				tokenString
			) *
			( tokenLineEnd | tokenLineError )
		) *
		tokenBlockEnd
	) *
	tokenFileEnd
*/

func lexer(filename string, ch chan<- token) {
	var fp *os.File
	if filename == "" {
		fp = os.Stdin
	} else {
		fp1, err := os.Open(filename)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		fp = fp1
		defer fp.Close()
	}
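	// NOTE: the expected line format is not documented here; the following is an
	// assumption inferred from how the break positions are used below. A line
	// seems to look roughly like
	//
	//	key#ignored#overlap|correct|system#count@feature|count@feature|...
	//
	// breaks[0] and breaks[1] mark '#', breaks[2] and breaks[3] mark '|',
	// breaks[4] marks '#', and the remaining breaks alternate '@' and '|'.
	// Lines with fewer than five delimiters, or an odd number of them, are
	// reported to stderr and skipped.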
	rd := bufio.NewReaderSize(fp, 100000)

	var blockname string
	var lineno int64 = 0
	var line string
	inBlock := false
	breaks := make([]int, 0, 1000)

	for {
		bline, isP, err := rd.ReadLine()
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		if isP {
			fmt.Fprintln(os.Stderr, "Line too long")
			os.Exit(1)
		}
		lineno++
		line = string(bline)
		breaks = breaks[0:0]
		j := 0
		var p byte = '#'
		for i, c := range bline {
			if c == p {
				breaks = append(breaks, i)
				j++
				switch {
				case j%2 == 1 && j > 4:
					p = '@'
				case j < 2 || j == 4:
					p = '#'
				default:
					p = '|'
				}
			}
		}
		if n := len(breaks); n < 5 || n%2 != 0 {
			fmt.Fprintf(os.Stderr, "Parse failed for line %v: %v\n", lineno, line)
			continue
		}
		if !inBlock || line[:breaks[0]] != blockname {
			if inBlock {
				ch <- token{t: tokenBlockEnd}
			}
			ch <- token{t: tokenBlockBegin}
			inBlock = true
			blockname = line[:breaks[0]]
			if *opt_d || *opt_r {
				ch <- token{t: tokenString, s: blockname}
			}
		}
		ch <- token{t: tokenLineBegin}
		v, _ := strconv.ParseInt(line[breaks[1]+1:breaks[2]], 10, 64)
		ch <- token{t: tokenInt, i: v}
		v, _ = strconv.ParseInt(line[breaks[2]+1:breaks[3]], 10, 64)
		ch <- token{t: tokenInt, i: v}
		v, _ = strconv.ParseInt(line[breaks[3]+1:breaks[4]], 10, 64)
		ch <- token{t: tokenInt, i: v}
		var w float64 = 0.0
		var fl float64
		breaks = append(breaks, len(line))
		for n, i := len(breaks)-1, 4; i < n; i += 2 {
			// ParseFloat is slow, so try ParseInt first, since most input values look like ints
			a := line[breaks[i]+1 : breaks[i+1]]
			ii, e := strconv.ParseInt(a, 10, 64)
			if e == nil {
				fl = float64(ii)
			} else {
				fl, _ = strconv.ParseFloat(a, 64)
			}
			if *opt_x {
				w += fl * *opt_u
			} else {
				ff, ok := features[line[breaks[i+1]+1:breaks[i+2]]]
				if ok {
					w += fl * ff
				} else {
					w += fl * *opt_u
				}
			}
		}
		ch <- token{t: tokenFloat, f: w}
	}
	if inBlock {
		ch <- token{t: tokenBlockEnd}
	}
	ch <- token{t: tokenFileEnd}
	close(ch)
}

func getWeights(filename string) {
	fp, err := os.Open(filename)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer fp.Close()
	rd := bufio.NewReaderSize(fp, 100000)
	splitBar := []byte{'|'}
	for {
		line, isP, err := rd.ReadLine()
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		if isP {
			fmt.Fprintln(os.Stderr, "Line too long")
			os.Exit(1)
		}
		items := bytes.Split(line, splitBar)
		if len(items) == 2 {
			s := string(items[0])
			_, ok := features[s]
			if !ok {
				f, _ := strconv.ParseFloat(string(items[1]), 64)
				features[s] = f
			}
		}
	}
}
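// Example usage (a sketch: the binary and input file names are hypothetical,
// and the feature names below are made up):
//
//	./score -w data.weights parses.txt
//
// The weights file read by getWeights contains one feature|weight pair per
// line, for example:
//
//	dep_root_verb|0.25
//	pos_mismatch|-1.5
//
// Only the first occurrence of a feature is kept, and lines that do not
// contain exactly one '|' are ignored.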