#!/apps/bin/perl
# corpus_analyze.prl: A Perl script which gives an analysis of either a
# corpus or a training/test corpus pair.
# The report gives the number of tokens, types, types
# seen more than once, and number of types and tokens
# found in the test corpus but not the training corpus.
# It gives analogous information for co-occurrences, and
# it prints sentence length histograms.
#
# Usage: corpus_analyze.prl corpus.file [test.corpus.file]
#
#   corpus.file      - the training corpus filename if there are 2 arguments,
#                      or any corpus if there is only one
#   test.corpus.file - the test corpus filename
#
# The corpus files MUST consist of three-line records:
#
#   line 1: a numeric coefficient (how many times the sentence pair counts)
#   line 2: the source sentence, tokens separated by whitespace
#   line 3: the target sentence, tokens separated by whitespace
#
# An incomplete record at the end of a file is ignored.
#
# Output is to STDOUT.
#
# Noah A. Smith, 7 July 1999; Statistical MT Team at WS '99
# Last edited 8 July 1999, NAS

use strict;
use warnings;
use List::Util qw(max);

my $MAX_SENTENCE_LEN = 30;    # longest sentence length shown in a histogram
my $HISTOGRAM_WIDTH  = 50;    # divisor used to derive the histogram scale
my $RULE_WIDTH       = 80;    # width of the horizontal rules

# Layout shared by every table line: a 35-column label followed by three
# right-aligned 15-column fields.
my $ROW_FORMAT = "%-35s%15s%15s%15s\n";

# Keys of the hash returned by summarize_counts() / summarize_cooc().
my @SUMMARY_FIELDS =
    qw(total unique repeated unseen_total unseen_unique unseen_repeated);

# Read one corpus file and return a hash ref with:
#   total_sent  - sum of the coefficients
#   unique_sent - number of records read
#   types_s/_t  - token => coefficient-weighted count (source / target)
#   lens_s/_t   - sentence length => number of records with that length
#   cooc        - source token => target token => weighted count
# Dies if the file cannot be opened or closed.
sub read_corpus {
    my ($path) = @_;

    # Hashes start out genuinely empty; assigning 0 to a hash would create
    # a key "0" that later gets counted as a type.
    my %stats = (
        total_sent  => 0,
        unique_sent => 0,
        types_s     => {},
        types_t     => {},
        lens_s      => {},
        lens_t      => {},
        cooc        => {},
    );

    open(my $fh, '<', $path) or die "ERROR: Can't open $path! ($!)\n";

    # Test definedness rather than chomp's return value, so a final line
    # without a trailing newline still counts.
    while (defined(my $coef_line = <$fh>)) {
        my $src_line = <$fh>;
        my $tgt_line = <$fh>;
        last unless defined $src_line && defined $tgt_line;
        chomp($coef_line, $src_line, $tgt_line);

        my $coefficient = $coef_line;
        $stats{total_sent} += $coefficient;
        $stats{unique_sent}++;

        # split ' ' skips leading whitespace and never yields empty
        # tokens, so no truth test is needed (which would drop "0").
        my @src = split ' ', $src_line;
        my @tgt = split ' ', $tgt_line;

        # Histogram bins count records, not coefficient-weighted sentences.
        $stats{lens_s}{ scalar @src }++;
        $stats{lens_t}{ scalar @tgt }++;

        $stats{types_s}{$_} += $coefficient for @src;
        for my $tgt_token (@tgt) {
            $stats{types_t}{$tgt_token} += $coefficient;
            $stats{cooc}{$_}{$tgt_token} += $coefficient for @src;
        }
    }

    close($fh) or die "ERROR: Can't close $path! ($!)\n";
    return \%stats;
}

# Summarize a key => count hash.  Returns a hash ref with:
#   total    - sum of all counts
#   unique   - number of keys
#   repeated - number of keys whose count is greater than 1
# and, when $seen (a reference key => count hash) is supplied, the same
# three figures restricted to keys that are missing from $seen or have a
# zero count there (unseen_total, unseen_unique, unseen_repeated).
sub summarize_counts {
    my ($counts, $seen) = @_;
    my %sum = map { $_ => 0 } @SUMMARY_FIELDS;

    for my $key (keys %$counts) {
        my $count = $counts->{$key};
        $sum{total} += $count;
        $sum{unique}++;
        $sum{repeated}++ if $count > 1;

        next if !$seen || $seen->{$key};
        $sum{unseen_total} += $count;
        $sum{unseen_unique}++;
        $sum{unseen_repeated}++ if $count > 1;
    }
    return \%sum;
}

# Same as summarize_counts(), but for the two-level co-occurrence hash
# (source token => target token => count).
sub summarize_cooc {
    my ($cooc, $seen) = @_;
    my %sum = map { $_ => 0 } @SUMMARY_FIELDS;

    for my $src_token (keys %$cooc) {
        # Read the reference row without creating it in the training data.
        my $seen_row = $seen ? ($seen->{$src_token} || {}) : undef;
        my $row_sum  = summarize_counts($cooc->{$src_token}, $seen_row);
        $sum{$_} += $row_sum->{$_} for @SUMMARY_FIELDS;
    }
    return \%sum;
}

# Print a horizontal rule made of $char, followed by a newline.
sub print_rule {
    my ($char) = @_;
    print STDOUT $char x $RULE_WIDTH, "\n";
    return;
}

# Print the column headings of a table, framed by dashed rules.
sub print_table_header {
    my ($total_heading, $unique_heading) = @_;
    print_rule('-');
    printf STDOUT $ROW_FORMAT, '', $total_heading, $unique_heading, '(C > 1)*';
    print_rule('-');
    return;
}

# Print one table row; the third figure is shown in parentheses.
sub print_row {
    my ($label, $total, $unique, $repeated) = @_;
    printf STDOUT $ROW_FORMAT, $label, $total, $unique, "($repeated)";
    return;
}

# Print the training row and, when a test summary is given, the test row
# plus the "in test but not in training" row.
sub print_count_rows {
    my ($train_label, $train_sum, $test_label, $test_sum) = @_;

    print_row($train_label, @{$train_sum}{qw(total unique repeated)});
    return unless $test_sum;

    print_row($test_label, @{$test_sum}{qw(total unique repeated)});
    print_row('     {TESTING} - {TRAINING}',
        @{$test_sum}{qw(unseen_total unseen_unique unseen_repeated)});
    return;
}

# Print one histogram: a row of stars for each length 1..$MAX_SENTENCE_LEN.
sub print_length_rows {
    my ($lens, $scale) = @_;
    print STDOUT "\tLENGTH\tCOUNT (* = $scale SENTENCES)\n";
    for my $len (1 .. $MAX_SENTENCE_LEN) {
        print STDOUT "\t$len\t", '*' x int(($lens->{$len} || 0) / $scale), "\n";
    }
    return;
}

# Print the source and target sentence length histograms of one corpus.
# Both use the same scale, derived from the fullest bin on either side.
sub print_histograms {
    my ($heading, $stats) = @_;

    my $fullest_bin =
        max(0, values %{ $stats->{lens_s} }, values %{ $stats->{lens_t} });
    my $scale = int(1 + $fullest_bin / $HISTOGRAM_WIDTH);

    print STDOUT "$heading\nSOURCE:\n";
    print_length_rows($stats->{lens_s}, $scale);
    print STDOUT "\nTARGET:\n";
    print_length_rows($stats->{lens_t}, $scale);
    return;
}

# ---------------------------------------------------------------------------
# Main program
# ---------------------------------------------------------------------------

# getlogin returns undef when there is no controlling terminal.
my $login = getlogin() || 'unknown';
print STDOUT "$0 run by $login on " . scalar(localtime) . "\n";

if (@ARGV < 1 || @ARGV > 2) {
    die "Usage: $0 corpus.file [test.corpus.file]\n";
}
my ($corpus, $testcorpus) = @ARGV;
my $twofiles = defined $testcorpus;

my $train = read_corpus($corpus);
my $test  = $twofiles ? read_corpus($testcorpus) : undef;

# Training figures need no "unseen" columns; test figures are compared
# against the training counts.
my %train_sum = (
    source => summarize_counts($train->{types_s}),
    target => summarize_counts($train->{types_t}),
    cooc   => summarize_cooc($train->{cooc}),
);
my %test_sum;
if ($twofiles) {
    %test_sum = (
        source => summarize_counts($test->{types_s}, $train->{types_s}),
        target => summarize_counts($test->{types_t}, $train->{types_t}),
        cooc   => summarize_cooc($test->{cooc}, $train->{cooc}),
    );
}

my $descr       = $twofiles ? " (TRAINING)" : " ";
my $train_label = (' ' x 5) . $corpus . $descr;
my $test_label  = $twofiles ? (' ' x 5) . $testcorpus . " (TESTING)" : '';

# The rest is just formatted printing to standard out.
print STDOUT "Analysis of $corpus";
print STDOUT " and $testcorpus" if $twofiles;
print STDOUT ":\n";
print_rule('=');

print STDOUT "CORPUS DATA:\n";
print STDOUT "\t$corpus contains $train->{total_sent} sentences ($train->{unique_sent} unique).\n";
print STDOUT "\t$testcorpus contains $test->{total_sent} sentences ($test->{unique_sent} unique).\n"
    if $twofiles;

# Token totals are the coefficient-weighted sums of the type counts, so
# they are on the same footing as the "unseen tokens" figures.
print_table_header('TOKENS', 'TYPES');
print STDOUT "SOURCE:\n";
print_count_rows($train_label, $train_sum{source}, $test_label, $test_sum{source});
print STDOUT "TARGET:\n";
print_count_rows($train_label, $train_sum{target}, $test_label, $test_sum{target});
print_rule('-');
print STDOUT "*The number of types which appear more than once.\n";

# co-occurrence data
print_rule('=');
print STDOUT "CO-OCCURENCE DATA:\n";
print_table_header('TOTAL', 'UNIQUE');
print_count_rows($train_label, $train_sum{cooc}, $test_label, $test_sum{cooc});
print_rule('-');
print STDOUT "*The number of co-occurences which appear more than once.\n";

# sentence length histograms
print_rule('=');
print STDOUT "SENTENCE LENGTH DATA:\n";
print_rule('-');
print_histograms("$corpus$descr", $train);
if ($twofiles) {
    print_rule('-');
    print_histograms("$testcorpus (TESTING)", $test);
}
print_rule('=');