### Barbara, August 2010
### bplank (at) gmail.com

# Directory that the copy-mrg target copies (recursively) into the current
# working directory.  Override on the command line if the corpus lives
# elsewhere:  make OrgPennTreebankWSJDir=/path/to/wsj/ copy-mrg
OrgPennTreebankWSJDir = /storage/opt_local/opt/corpora/PennTreebank/combined/wsj/
#ptbversion=v2

########### copy original Penn TB wsj ########################
# Copy the original WSJ tree into the current directory as ./wsj.
#
# copy-mrg is a command, not a file, so it is declared phony; otherwise a
# stray file called "copy-mrg" would make this target look up to date.
#
# A trailing "/" on the source is stripped before calling cp: GNU cp ignores
# it, but BSD/macOS cp would copy the directory's *contents* into "." instead
# of creating ./wsj, and the later targets expect the wsj/ directory.
.PHONY: copy-mrg
copy-mrg:
	cp -r $(patsubst %/,%,$(OrgPennTreebankWSJDir)) .

########### split PTB files that contain concatenated TIPSTER articles into file a and b ################################
## See http://www.seas.upenn.edu/~pdtb/genre_files.html (part II)
## and the README of the three map files.

# Files handled by split-concatenated, given relative to wsj/ and without
# the .mrg suffix.  Each one is renamed to <name>.mrg.dupl, split into
# <name>a.mrg / <name>b.mrg, and the .dupl copy is then removed.
files_dupl = \
  00/wsj_0037 01/wsj_0118 01/wsj_0166 02/wsj_0283 \
  05/wsj_0545 05/wsj_0576 08/wsj_0814 09/wsj_0990 \
  11/wsj_1154 11/wsj_1156 12/wsj_1250 14/wsj_1467 \
  17/wsj_1743 18/wsj_1875 20/wsj_2055 21/wsj_2136 \
  23/wsj_2346 24/wsj_2417 18/wsj_1809
# Files not found: 03/wsj_0384 03/wsj_0395 21/wsj_2181 22/wsj_2284



# Split points for the files listed in $(files_dupl), in the same order.
# Each entry is <section>/<file>:<N>.  Lines 1..N of the original file go to
# <file>a.mrg and lines N+1..end go to <file>b.mrg.
#
# Keep this table in sync with files_dupl: a file listed there without an
# entry here would be renamed and then deleted without ever being split.
split_points := \
  00/wsj_0037:987 \
  01/wsj_0118:2877 \
  01/wsj_0166:172 \
  02/wsj_0283:260 \
  05/wsj_0545:40 \
  05/wsj_0576:665 \
  08/wsj_0814:976 \
  09/wsj_0990:935 \
  11/wsj_1154:1337 \
  11/wsj_1156:443 \
  12/wsj_1250:184 \
  14/wsj_1467:177 \
  17/wsj_1743:599 \
  18/wsj_1875:2319 \
  20/wsj_2055:150 \
  21/wsj_2136:67 \
  23/wsj_2346:677 \
  24/wsj_2417:1352 \
  18/wsj_1809:1181

# split-concatenated: replace every wsj/<f>.mrg named in files_dupl by the two
# files wsj/<f>a.mrg and wsj/<f>b.mrg.
#
# Steps, each aborting the build on the first failing command ("set -e";
# a plain ";"-chained loop would only report the status of its last
# iteration and hide earlier failures):
#   1. verify that every input file exists, before anything is modified
#      (this also gives a clean error when the target is run a second time);
#   2. rename <f>.mrg to <f>.mrg.dupl;
#   3. write the first N lines to <f>a.mrg and the rest to <f>b.mrg
#      (POSIX "head -n N" / "tail -n +N+1"; the second number is computed
#      from the first so the two can never disagree);
#   4. remove the .dupl copies.
#
# The target names an action rather than a file, hence .PHONY.
.PHONY: split-concatenated
split-concatenated:
	@for f in $(files_dupl); do \
		test -f wsj/$$f.mrg || { echo "split-concatenated: missing wsj/$$f.mrg" >&2; exit 1; }; \
	done
	set -e; for f in $(files_dupl); do echo $$f; mv wsj/$$f.mrg wsj/$$f.mrg.dupl; done
	set -e; for entry in $(split_points); do \
		f=$${entry%%:*}; n=$${entry##*:}; \
		head -n $$n wsj/$$f.mrg.dupl > wsj/$${f}a.mrg; \
		tail -n +$$((n + 1)) wsj/$$f.mrg.dupl > wsj/$${f}b.mrg; \
	done
	echo "remove .dupl files.."
	set -e; for f in $(files_dupl); do echo $$f; rm wsj/$$f.mrg.dupl; done


