/*
Processes /net/corpora/DutchSemCor/dsc20120708-HUMAN.log

Adds the name of the source file, the ID of the sentence, and the sentence
itself to each line.

Syntax:

	./logmod input output

Typical use:

	nohup ./logmod /net/corpora/DutchSemCor/dsc20120708-HUMAN.log logmod.results.csv 2> logmod.results.log &

The first run takes a long time (well over half a day).

A later run (after changing the program, or adding source files) can use the
output of the first run as its input. That is much faster (minutes):

	./logmod logmod.results.csv logmod.results.csv2 2> logmod.results.log
*/
package main

import (
	"bytes"
	"compress/gzip"
	"encoding/csv"
	"encoding/xml"
	"fmt"
	"html"
	"io"
	"io/ioutil"
	"os"
	"sort"
	"strconv"
	"strings"

	"code.google.com/p/go-charset/charset"
	_ "code.google.com/p/go-charset/data"
	"github.com/pebbe/util"
)

var (
	prefix = "/net/corpora/LassyDevelop/Sonar/SONAR500/DATA"

	// fixit maps word forms with a mis-encoded apostrophe to their repaired forms.
	fixit = map[string]string{
		"administratie?":        "administratie’",
		"administratie\302\222": "administratie’",
		"drukke?":               "drukke’",
		"drukke\302\222":        "drukke’",
		"gordel?":               "gordel’",
		"gordel\302\222":        "gordel’",
		"programma?s":           "programma’s",
		"programma\302\222s":    "programma’s",
		"schema?s":              "schema’s",
		"schema\302\222s":       "schema’s",
		"studio?s":              "studio’s",
		"studio\302\222s":       "studio’s",
		"thema?s":               "thema’s",
		"thema\302\222s":        "thema’s",
		"vuile?":                "vuile’",
		"vuile\302\222":         "vuile’",
		"wijze?":                "wijze’",
		"wijze\302\222":         "wijze’",
	}
)

// Types for the FoLiA XML

type Lemma struct {
	Class string `xml:"class,attr"`
}

type Word struct {
	XMLName xml.Name `xml:"w"`
	ID      string   `xml:"id,attr"`
	T       string   `xml:"t"`
	Lemma   Lemma    `xml:"lemma"`
}

// Types for the CSV lines

type LineType struct {
	idx      int
	fields   []string
	filename string
	s_id     string
	sentence string
}

// LinesType sorts by token ID, so that all lines from the same source file are adjacent.
type LinesType []*LineType

func (s LinesType) Len() int      { return len(s) }
func (s LinesType) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s LinesType) Less(i, j int) bool {
	// Compare by ID
	return s[i].fields[1] < s[j].fields[1]
}

// LinesType2 sorts by the original line number, to restore the input order for output.
type LinesType2 []*LineType

func (s LinesType2) Len() int      { return len(s) }
func (s LinesType2) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s LinesType2) Less(i, j int) bool {
	// Compare by line number
	return s[i].idx < s[j].idx
}

func main() {

	if len(os.Args) != 3 {
		fmt.Printf(`
usage: %s infile outfile 2> logfile
`, os.Args[0])
		return
	}

	var headerline []string

	// Map from corpus label to directory name
	dirnames := make(map[string]string)
	fileinfos, err := ioutil.ReadDir(prefix)
	util.CheckErr(err)
	for _, fileinfo := range fileinfos {
		n := fileinfo.Name()
		parts := strings.Split(n, "_")
		if len(parts) > 1 {
			dirnames[parts[0]] = n
		}
	}

	// Read the CSV file and sort its lines by token ID
	lines := make([]*LineType, 0)
	fp, err := os.Open(os.Args[1])
	util.CheckErr(err)
	rd := csv.NewReader(fp)
	for lineno := 0; true; lineno++ {
		fields, err := rd.Read()
		if err == io.EOF {
			break
		}
		util.CheckErr(err)
		if fields[0] == "num" {
			// Header line: add the names of the three new columns, if not present yet.
			headerline = fields
			if len(headerline) < 15 {
				headerline = append(headerline, "filename", "sentence_id", "sentence")
			}
			continue
		}
		lines = append(lines, &LineType{idx: lineno, fields: fields})
	}
	fp.Close()
	sort.Sort(LinesType(lines))

	// Process all lines from the CSV file
	var words map[string]*Word
	var sentences map[string][]string
	rawlines := make([]string, 0)
	ok := false
	curfile := ""
	filetype := ""
	filename := ""
	nlines := len(lines)
	for lineno, line := range lines {
		fmt.Printf(" %d \r", nlines-lineno)
		fields := line.fields
		if len(fields) == 15 {
			if fields[14] != "" {
				// Already filled in by an earlier run: keep as is.
				continue
			}
			// Sentence still missing: strip the empty columns and try again.
			lines[lineno].fields = lines[lineno].fields[:12]
		}
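		// The switch below decides, per token ID (fields[1]) and source type
		// (fields[7]), which source file to open. Illustrative examples only;
		// the exact ID shapes are inferred from the comments and from the way
		// the IDs are taken apart further down:
		//
		//   SoNaR: WR-P-P-C-0000000140.p.1318.s.7.w.2
		//          -> <prefix>/<dir>/WR-P-P-C-0000000140.folia.xml
		//   CGN:   CGN-comp-a_fn000248.s.116.w.5
		//          -> /net/corpora/CGN_ANN_V2/data/annot/text/plk/comp-a/nl/fn000248.plk.gz
		//   Words: allwords-WR-P-P-L-0000000003.s.3.w.4 (assumed shape)
		//          -> /net/corpora/LassyDevelop/Suites/WR-P-P-L-0000000003.sents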
		// Continue with the same source file, or open a new one
		n := strings.Split(fields[1], ".")[0]
		if n != curfile {
			curfile = n
			filetype = fields[7]
			ok = false
			switch filetype {
			case "SoNaR":
				filename = prefix + "/" + dirnames[n[:strings.LastIndex(n, "-")]] + "/" + n + ".folia.xml"
				data, err := ioutil.ReadFile(filename)
				if err != nil {
					fmt.Fprintln(os.Stderr, err)
				} else {
					ok = true
					words = make(map[string]*Word)
					sentences = make(map[string][]string)
					parts := bytes.SplitAfter(data, []byte("</w>"))
					parts = parts[:len(parts)-1] // the part after the last </w> contains no word
					for _, p := range parts {
						// Unmarshal each <w> ... </w> element and index it by its ID.
						part := p[bytes.Index(p, []byte("<w")):]
						var word Word
						err := xml.Unmarshal(part, &word)
						util.CheckErr(err)
						words[word.ID] = &word
						// Collect the words of each sentence, keyed by sentence ID.
						sID := word.ID[:strings.Index(word.ID, ".w.")]
						sentences[sID] = append(sentences[sID], word.T)
					}
				}
			case "CGN":
				// e.g. CGN-comp-a_fn000248 -> /net/corpora/CGN_ANN_V2/data/annot/text/plk/comp-a/nl/fn000248.plk.gz
				s := strings.Replace(curfile, "CGN-", "", 1)
				p := strings.Split(s, "_")
				q := strings.SplitN(p[1], ".", 2)
				filename = "/net/corpora/CGN_ANN_V2/data/annot/text/plk/" + p[0] + "/nl/" + q[0] + ".plk.gz"
				fp, err = os.Open(filename)
				if err != nil {
					fmt.Fprintln(os.Stderr, err)
				} else {
					r1, err := gzip.NewReader(fp)
					util.CheckErr(err)
					r, err := charset.NewReader("latin1", r1)
					util.CheckErr(err)
					rd := util.NewLinesReaderFromReader(r)
					idx := ""
					sentences = make(map[string][]string)
					for line := range rd.ReadLines() {
						if strings.HasPrefix(line, "<au") {
							// An <au id="..."> line starts a new sentence; its id is
							// taken to be the sentence number that also appears in
							// the token IDs (.s.NNN.).
							if i := strings.Index(line, `id="`); i >= 0 {
								idx = line[i+4:]
								if j := strings.Index(idx, `"`); j >= 0 {
									idx = idx[:j]
								}
							}
						} else {
							// A token line: the first field is the word form.
							flds := strings.Fields(line)
							if len(flds) > 0 {
								sentences[idx] = append(sentences[idx], flds[0])
							}
						}
					}
					fp.Close()
					ok = true
				}
			case "Words":
				// e.g. allwords-WR-P-P-L-0000000003 -> /net/corpora/LassyDevelop/Suites/WR-P-P-L-0000000003.sents
				//      -> /net/corpora/LassyDevelop/Suites/dpc-bmm-001087-nl-sen.sents
				//      -> /net/corpora/LassyDevelop/Suites/wiki-138.sents
				s := strings.Replace(curfile, "allwords-", "", 1)
				p := strings.SplitN(s, ".", 2)[0]
				if strings.HasPrefix(p, "wiki") {
					p = "wiki-" + p[4:]
				}
				filename = "/net/corpora/LassyDevelop/Suites/" + p + ".sents"
				rd, err := util.NewLinesReaderFromFile(filename)
				if err != nil {
					fmt.Fprintln(os.Stderr, err)
				} else {
					rawlines = rawlines[0:0]
					for line := range rd.ReadLines() {
						rawlines = append(rawlines, line)
					}
					ok = true
				}
			case "Snippet":
			default:
				panic(fields[1] + ": unknown source type: " + fields[7])
			}
		}

		// No source file could be opened for this line
		if !ok {
			fmt.Fprintf(os.Stderr, "%s: file not found (%s / %s)\n", fields[1], fields[2], fields[7])
			continue
		}

		switch filetype {
		case "SoNaR":
			// Look the token up by its ID in the FoLiA data
			w, found := words[fields[1]]
			if found {
				/*
					if fields[1] == "WR-P-P-C-0000000140.p.1318.s.7.w.2" {
						fmt.Println(w.T)
						fmt.Println(fields[2])
						return
					}
				*/
				mismatch := false
				w1 := w.T
				w2 := fields[2]
				if ww, ok := fixit[w1]; ok {
					w1 = ww
				}
				if ww, ok := fixit[w2]; ok {
					w2 = ww
				}
				w1 = strings.ToLower(w1)
				w2 = strings.ToLower(html.UnescapeString(w2))
				if w1 != w2 {
					mismatch = true
					fmt.Fprintf(os.Stderr, "%s: T: %s != %s\n", fields[1], w2, w1)
				}
				if w.Lemma.Class != html.UnescapeString(fields[3]) {
					mismatch = true
					fmt.Fprintf(os.Stderr, "%s: Lemma: %s != %s\n", fields[1], html.UnescapeString(fields[3]), w.Lemma.Class)
				}
				if !mismatch {
					lines[lineno].filename = filename
					sID := fields[1][:strings.Index(fields[1], ".w.")]
					lines[lineno].sentence = strings.Join(sentences[sID], " ")
					lines[lineno].s_id = sID
				}
			} else {
				fmt.Fprintf(os.Stderr, "%s: word not found (%s / %s)\n", fields[1], fields[2], fields[7])
			}
		case "CGN":
			// Example line: 512553,CGN-comp-a_fn000248.s.116.w.5,voordoen,voordoen,,v,,CGN,r_v-9709,Elizabeth,N,2011-06-06 14:09:14
			i := strings.Split(fields[1], ".")
			nr, err := strconv.Atoi(i[4])
			util.CheckErr(err)
			found := false
			s, ok := sentences[i[2]]
			if ok {
				if nr <= len(s) {
					if strings.ToLower(s[nr-1]) == html.UnescapeString(fields[2]) {
						found = true
						lines[lineno].filename = filename
						lines[lineno].sentence = strings.Join(s, " ")
						lines[lineno].s_id = i[2]
					} else {
						fmt.Fprintf(os.Stderr, "%s: T: %s != %s\n", fields[1], fields[2], s[nr-1])
					}
				}
			}
			if !found {
				fmt.Fprintf(os.Stderr, "%s: word not found (%s / %s)\n", fields[1], fields[2], fields[7])
			}
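		// The "Words" case below assumes that line N of the .sents file holds
		// sentence N as "sentence-id|token token ...", and that the word
		// position in the token ID is 1-based. A hypothetical example of such
		// a line (not taken from a real file):
		//
		//   WR-P-P-L-0000000003.p.1.s.3|Dit is een voorbeeld .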
		case "Words":
			i := strings.Split(fields[1], ".")
			si, err := strconv.Atoi(i[2])
			util.CheckErr(err)
			wi, err := strconv.Atoi(i[4])
			util.CheckErr(err)
			found := false
			if si <= len(rawlines) {
				rawline := strings.SplitN(rawlines[si-1], "|", 2)
				id := rawline[0]
				wrds := strings.Fields(rawline[1])
				if wi <= len(wrds) {
					if wrds[wi-1] == html.UnescapeString(fields[2]) {
						found = true
						lines[lineno].filename = filename
						lines[lineno].s_id = id
						lines[lineno].sentence = rawline[1]
					} else {
						fmt.Fprintf(os.Stderr, "%s: T: %s != %s\n", fields[1], fields[2], wrds[wi-1])
					}
				}
			}
			if !found {
				fmt.Fprintf(os.Stderr, "%s: word not found (%s / %s)\n", fields[1], fields[2], fields[7])
			}
		case "Snippet":
			fmt.Fprintf(os.Stderr, "%s: word not found (%s / %s)\n", fields[1], fields[2], fields[7])
		default:
			panic(fields[1] + ": unknown source type: " + fields[7])
		}
	}
	fmt.Print("          \r")

	// Restore the original line order and write the output CSV
	sort.Sort(LinesType2(lines))
	fp, err = os.Create(os.Args[2])
	util.CheckErr(err)
	defer fp.Close()
	w := csv.NewWriter(fp)
	defer w.Flush()
	if len(headerline) > 0 {
		util.CheckErr(w.Write(headerline))
	}
	for _, line := range lines {
		if len(line.fields) < 15 {
			line.fields = append(line.fields, line.filename, line.s_id, line.sentence)
		}
		util.CheckErr(w.Write(line.fields))
	}
}
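// For a CSV line such as the example quoted in the CGN case above,
//
//	512553,CGN-comp-a_fn000248.s.116.w.5,voordoen,voordoen,,v,,CGN,r_v-9709,Elizabeth,N,2011-06-06 14:09:14
//
// a successful lookup fills the three extra output columns roughly as follows
// (values derived from the code above, not from an actual run):
//
//	filename:    /net/corpora/CGN_ANN_V2/data/annot/text/plk/comp-a/nl/fn000248.plk.gz
//	sentence_id: 116
//	sentence:    the tokens of sentence 116, joined with spaces
//
// Lines for which no match was found keep empty values in these columns, so a
// later run can try them again.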