#!/local/bin/perl
# -*- perl -*-
#
# usage:
#     prepare-data output-file-prefix
#
# Reads a directory of pairs of source and target files.  It is assumed that
# these files are prepared so that two corresponding files have similar file
# names and contain line-by-line corresponding text (e.g. the first line of
# source file 1 is a translation of the first line of target file 1, and so
# on).
#
# Three files are generated:
#
# 1.   Corpus file (<prefix>.corpus), in line-triples format:
#        first line:  number of occurrences of the sentence pair
#        second line: source sentence (tokens replaced by unique integer ids)
#        third line:  target sentence (tokens replaced by unique integer ids)
# 2&3. Source and target vocabulary files (<prefix>.source.vcb and
#      <prefix>.target.vcb).  Each line in these files is a tuple:
#
#        integer_identifier token_string no_occurrences_in_corpus
#
# Tokens are the whitespace-separated fields of each input line.  A sentence
# pair in which either side has no tokens is skipped (and reported on STDERR)
# before any vocabulary counting, so the vocabulary frequencies only reflect
# pairs that can appear in the corpus file.

use strict;
use warnings;
use File::Temp qw(tempfile);

# First integer id handed out to a source / target token.
my $BASE_SOURCE_TOKEN_ID = 10;
my $BASE_TARGET_TOKEN_ID = 10;

# Glob pattern selecting the source-side files (corpus specific).
my $source_dir = "/nfs/mendels1/fiesta/corpus/czech/rd_data/rd*e.pll";
#$target_dir = "/nfs/mendels1/fiesta/corpus/czech/rd_data/temp/rd*c.pll";

# Replace each word of @$words by its integer id, assigning a fresh id (taken
# from $$next_id_ref, which is then incremented) to words not seen before, and
# bump the word's frequency.  Returns the ids joined by single spaces.
sub encode_sentence {
    my ($words, $id_of, $freq_of, $next_id_ref) = @_;

    my @ids;
    foreach my $word (@$words) {
        if (!defined $id_of->{$word}) {
            $id_of->{$word} = ${$next_id_ref}++;
        }
        $freq_of->{$word}++;
        push @ids, $id_of->{$word};
    }
    return join(' ', @ids);
}

# Write one corpus entry (count, source ids, target ids) for an intermediate
# line of the form "<source ids>\t<target ids>".
sub write_pair {
    my ($out_fh, $line, $count) = @_;

    if ($line =~ /^([0-9]+(?: [0-9]+)*)\t([0-9]+(?: [0-9]+)*)\z/) {
        print {$out_fh} "$count\n$1\n$2\n";
    }
    else {
        print STDERR "CAn't recognize line format: $line\n";
    }
    return;
}

# Write a vocabulary file with one "id token frequency" tuple per line,
# ordered by id so that the output is reproducible from run to run.
sub write_vocabulary {
    my ($path, $side, $id_of, $freq_of) = @_;

    open(my $fh, '>', $path)
        or die("$0: can't create $side vocabulary file: $path: $!\n");
    foreach my $word (sort { $id_of->{$a} <=> $id_of->{$b} } keys %$id_of) {
        printf {$fh} "%d %s %d\n", $id_of->{$word}, $word, $freq_of->{$word};
    }
    close($fh) or die("$0: error writing $path: $!\n");
    return;
}

# ---------------------------------------------------------------------------
# Main program
# ---------------------------------------------------------------------------

# Without a prefix the outputs would silently become ".corpus", ".source.vcb"
# and ".target.vcb", so insist on one before doing any work.
(defined $ARGV[0] && length $ARGV[0])
    or die("usage: $0 output-file-prefix\n");
my $prefix = $ARGV[0];

my $source_max = $BASE_SOURCE_TOKEN_ID;
my $target_max = $BASE_TARGET_TOKEN_ID;
my (%source_voc_id, %source_voc_freq);
my (%target_voc_id, %target_voc_freq);

my @source_files = glob($source_dir);
@source_files or die("$0: no source files match $source_dir\n");

# Intermediate file of encoded sentence pairs, one pair per line.  A private
# temporary file avoids clashes between concurrent runs and the risks of a
# predictable name in a shared directory; it is removed when the script exits.
my ($tmp_fh, $tmp_file) = tempfile('corpus.XXXXXX', TMPDIR => 1, UNLINK => 1);

foreach my $source_file (@source_files) {
    my $target_file = $source_file;
    $target_file =~ s/e\.pll\z/c.pll/;    # this line is corpus specific
    $target_file ne $source_file
        or die("$0: can't derive target file name from $source_file\n");

    print STDERR " Reading sentence pairs from $source_file & $target_file\n";
    open(my $source_fh, '<', $source_file)
        or die("$0: can't read $source_file: $!\n");
    open(my $target_fh, '<', $target_file)
        or die("$0: can't read $target_file: $!\n");

    my $line_no = 0;
    my $skipped = 0;
    while (1) {
        my $source = <$source_fh>;
        my $target = <$target_fh>;
        if (!defined $source || !defined $target) {
            # One file still having lines means the pair is out of step.
            if (defined $source || defined $target) {
                print STDERR "$0: $source_file and $target_file differ in",
                    " length; stopped after $line_no line(s)\n";
            }
            last;
        }
        $line_no++;

        # split ' ' discards leading/trailing whitespace and treats any run
        # of whitespace (including the line terminator) as one separator.
        my @source_words = split ' ', $source;
        my @target_words = split ' ', $target;
        if (!@source_words || !@target_words) {
            $skipped++;
            next;
        }

        my $source_ids = encode_sentence(\@source_words, \%source_voc_id,
                                         \%source_voc_freq, \$source_max);
        my $target_ids = encode_sentence(\@target_words, \%target_voc_id,
                                         \%target_voc_freq, \$target_max);
        print {$tmp_fh} "$source_ids\t$target_ids\n";
    }
    if ($skipped) {
        print STDERR "$0: skipped $skipped pair(s) with an empty side in",
            " $source_file & $target_file\n";
    }

    close($target_fh);
    close($source_fh);
}
close($tmp_fh) or die("$0: can't write to $tmp_file: $!\n");

# Sort the encoded pairs so that identical pairs become adjacent, then count
# each run.  sort(1) is started without a shell (list-form pipe open) and
# under the C locale so that the ordering does not depend on the environment.
my $sorted_fh;
{
    local $ENV{LC_ALL} = 'C';
    open($sorted_fh, '-|', 'sort', $tmp_file)
        or die("$0: can't run sort on $tmp_file: $!\n");
}
open(my $corpus_fh, '>', "$prefix.corpus")
    or die("$0: can't create processed corpus file: $prefix.corpus: $!\n");

my $previous;
my $count = 0;
while (defined(my $line = <$sorted_fh>)) {
    chomp $line;
    if (defined $previous && $line eq $previous) {
        $count++;
        next;
    }
    write_pair($corpus_fh, $previous, $count) if defined $previous;
    $previous = $line;
    $count    = 1;
}
write_pair($corpus_fh, $previous, $count) if defined $previous;

close($corpus_fh) or die("$0: error writing $prefix.corpus: $!\n");
close($sorted_fh) or die("$0: sort of $tmp_file failed (status $?)\n");

write_vocabulary("$prefix.source.vcb", 'source',
                 \%source_voc_id, \%source_voc_freq);
write_vocabulary("$prefix.target.vcb", 'target',
                 \%target_voc_id, \%target_voc_freq);