#!/apps/bin/perl
#
# corpus_split.prl: A Perl script which splits a corpus into training and
#                   testing sets, based on size parameters given at the
#                   command line.
#
# Usage: corpus_split.prl corpus.file [--train X] [--test Y]
#
#   corpus.file - the corpus filename
#   X requests a training set size of X sentences
#   Y requests a testing set size of Y sentences
#   (X and/or Y must be given)
#
# Output will be two files, corpus.file.train and corpus.file.test .
#
# The corpus is read as records of three lines each.  The first line of a
# record is used as a number that is added to the running sentence total of
# the set the record goes to; every record counts as one "unique" sentence.
# The second and third lines are copied through unchanged.
#
# When both sizes are given, records that fit in neither set are dropped.
# When only one size is given, records that do not fit in that set go to
# the other one.
#
# Noah A. Smith, 8 July 1999; Statistical MT Team at WS '99
# Last edited 9 July 1999, NAS

use strict;
use warnings;

# getlogin returns undef when there is no controlling terminal (cron, some
# batch systems), so fall back to the password database and then a literal.
my $user = getlogin() || eval { scalar getpwuid($<) } || 'unknown';
print STDOUT "$0 run by " . $user . " on " . scalar(localtime) . "\n";

# gather appropriate information from command line
my $usage = "Usage: $0 corpus.file [--train trainsize] [--test testsize]\n";

my $argc = @ARGV;
die $usage if $argc != 3 && $argc != 5;

my ($corpus, @options) = @ARGV;
my %size_for = ('--train' => 0, '--test' => 0);
while (@options) {
    my ($option, $value) = splice(@options, 0, 2);
    die $usage unless exists $size_for{$option};
    # Sizes are compared numerically below; reject anything that is not a
    # plain non-negative integer instead of silently treating it as 0.
    die $usage unless $value =~ /\A\d+\z/;
    $size_for{$option} = $value + 0;
}
my $trainsize = $size_for{'--train'};
my $testsize  = $size_for{'--test'};

# "X and/or Y must be given": with both sizes zero nothing would ever be
# written, so refuse before the output files get truncated.
die $usage unless $trainsize || $testsize;

my $train = "$corpus.train";
my $test  = "$corpus.test";

open(my $corpus_fh, '<', $corpus) or die "ERROR: Can't open $corpus! ($!)\n";
open(my $train_fh,  '>', $train)  or die "ERROR: Can't open $train! ($!)\n";
open(my $test_fh,   '>', $test)   or die "ERROR: Can't open $test! ($!)\n";

if ($trainsize && $testsize) {
    print STDERR "Training corpus ($train) will be $trainsize sentences.\nTesting corpus ($test) will be $testsize sentences.\nCreating corpora ...";
}
elsif ($trainsize) {
    print STDERR "Training corpus ($train) will be $trainsize sentences.\nTesting corpus ($test) will consist of the rest of the sentences.\nCreating corpora ...";
}
else {
    print STDERR "Testing corpus ($test) will be $testsize sentences.\nTraining corpus ($train) will consist of the rest of the sentences.\nCreating corpora ...";
}

my $traincurr   = 0;    # sentence total written to the training set
my $testcurr    = 0;    # sentence total written to the testing set
my $trainunique = 0;    # number of records written to the training set
my $testunique  = 0;    # number of records written to the testing set

# create the corpora
RECORD:
while (1) {
    # Read one three-line record.  Test for definedness rather than the
    # return value of chomp, so a final line without a trailing newline is
    # still processed.  An incomplete trailing record is ignored.
    my @record;
    for (1 .. 3) {
        my $line = <$corpus_fh>;
        last RECORD unless defined $line;
        chomp $line;
        push @record, $line;
    }
    my ($linea, $lineb, $linec) = @record;

    # Strip the first run of whitespace from the count line before it is
    # used as a number.
    $linea =~ s/\s+//;

    if ($trainsize and ($traincurr + $linea) <= $trainsize) {
        print {$train_fh} "$linea\n$lineb\n$linec\n";
        $traincurr += $linea;
        $trainunique++;
    }
    elsif ($testsize and ($testcurr + $linea) <= $testsize) {
        print {$test_fh} "$linea\n$lineb\n$linec\n";
        $testcurr += $linea;
        $testunique++;
    }
    elsif ($trainsize and !$testsize) {
        # Only a training size was requested: overflow goes to testing.
        print {$test_fh} "$linea\n$lineb\n$linec\n";
        $testcurr += $linea;
        $testunique++;
    }
    elsif ($testsize and !$trainsize) {
        # Only a testing size was requested: overflow goes to training.
        print {$train_fh} "$linea\n$lineb\n$linec\n";
        $traincurr += $linea;
        $trainunique++;
    }

    # Both requested sizes have been met exactly; no need to read further.
    last RECORD
        if  $trainsize and $traincurr == $trainsize
        and $testsize  and $testcurr  == $testsize;
}

print STDERR " done.\n";

close($corpus_fh) or die "ERROR: Can't close $corpus!\n";
close($train_fh)  or die "ERROR: Can't close $train!\n";
close($test_fh)   or die "ERROR: Can't close $test!\n";

if ($trainsize and ($traincurr != $trainsize)) {
    print STDERR "Unable to create training corpus of $trainsize sentences (only $traincurr available, $trainunique unique).\n";
}
else {
    print STDERR "Training corpus ($train) successfully created: $traincurr sentences, $trainunique unique.\n";
}

if ($testsize and ($testcurr != $testsize)) {
    print STDERR "Unable to create testing corpus of $testsize sentences (only $testcurr available, $testunique unique).\n";
}
else {
    print STDERR "Testing corpus ($test) successfully created: $testcurr sentences, $testunique unique.\n";
}