cp -r /export/ws03_mt/lab .
setenv PATH ~anoop/tools/bin:$PATH; rehash
export PATH=~anoop/tools/bin:$PATH
f: Abraham begat Isaac ; and Isaac begat Jacob ; and Jacob begat Judas and his brethren
e: Abraham was the father of Isaac , and Isaac the father of Jacob , and Jacob the father of Judah and his brothers
Other foreign (f) languages available in the ~/mtrun/corpora directory are French, Spanish, Swedish and Tetun (East Timorese). As shown above, for this lab, we will be using two forms of English as source text and foreign text. This is useful because it will allow you to understand the kinds of decisions and the kinds of errors made by MT systems. Since training is computationally expensive, we'll create a subset of our training data (note that system performance will be better with more training data). First, we create the training data for GIZA++:
cd lab/giza-lab/english
head -3000 bible.train.english > run/bible.e
head -3000 bible.train.foreign > run/bible.f
cd run
runGIZA++.pl bible.e bible.f linux
cd ~/lab/giza-lab/english/run
sh do_cairoize .
/home/ws02/anoop/tools/cairo/run_cairo
cd ~/lab/giza-lab/english/run/lm
perl voc.pl < ../../bible.train.english > english.bible.voc
perl ~anoop/tools/bin/runCMUToolkit.pl ../../bible.train.english -d . -v english.bible.voc
cd ~/lab/giza-lab/english/run/decode
LanguageModelFile = YOUR_HOME_DIR/lab/giza-lab/english/run/lm/bible.train.english.binlm
TranslationModelConfigFile = YOUR_HOME_DIR/lab/giza-lab/english/run/linux/tmconfig.cfg
head -100 ../../bible.eval.foreign | isi-decoder.linux --config isi-decoder.config > english.out
cd ~/lab/giza-lab/english/run/eval
perl mteval-v09c.pl -r ref.sgm -s src.sgm -t src1.sgm -b
perl trg2sgml.pl ../decode/english.out > trg.sgm
perl mteval-v09c.pl -r ref.sgm -s src.sgm -t trg.sgm -b
| Chinese Word (pinyin) | English Translation |
| --- | --- |
| da4 | big |
| da4jie1 | avenue |
| wo3 | I |
| fang4 | place |
| jie1 | avenue |
| jie3fang4 | liberation |
| bu4 | not |
| liao3jie3 | understand |
| wang4 | forget |
| wang4bu4liao3 | unable to forget |
| fang4da4 | enlarge |
| na3li3 | where |
| zai4 | at |
setenv PATH /export/WS97/riley/tools-99/linux/bin/fsm-3/:$PATH; rehash
export PATH=/export/WS97/riley/tools-99/linux/bin/fsm-3/:$PATH
cd ~/lab/seg-lab
awk 'NF>2 {print $3}' fsm1.txt | sort -u | perl -e ' print "<epsilon>\t0\n"; print "_\t1\n"; while (<>) { chomp; print "$_\t",$.+1,"\n";}' > char.map
fsmcompile -i char.map fsm1.txt > fsm1.fsa
cat fsm1.fsa | /export/skumar/work/ATT/bin.linux_glibc/fsmmknb_fsa -n 5 -i char.map -l -
fsmcompile -t -i char.map -o char.map T1.txt > T1.fst
fsmrandgen -n 1 fsm1.fsa > string.fsa
cat string.fsa | fsmcompose - T1.fst | fsmproject -o | fsmrmepsilon | /export/skumar/work/ATT/bin.linux_glibc/fsmmknb_fsa -n 5 -i char.map -l -
fsmcompile -i char.map string2.txt > string2.fsa
fsmcompose string2.fsa T1.fst | fsmbestpath -n 2 | fsmproject -o | fsmrmepsilon | fsmprint -i char.map
awk 'NF>2 { print $NF}' T2.txt | sort -u | perl -e 'while (<>) { chomp; print "$_\t$.\n";}' > eng.map
fsmcompile -t -i char.map -o eng.map T2.txt > T2.fst
fsmcompose string2.fsa T1.fst | fsmbestpath -n 2 | fsmproject -o | fsmrmepsilon | fsmcompose - T2.fst | fsmprint -i char.map -o eng.map