WUT_Computer_Science/test_goldStandard/wellformed.pl
2024-01-06 14:02:40 +01:00

184 lines
5.2 KiB
Perl
Executable File

#!/usr/bin/perl
#
=head1 wellformed.pl
=head1 SYNOPSIS
wellformed.pl file.wa --stats=[01]
Checks well-formedness of .wa files
focusing on alignment part.
Prints statistics if correct
Example:
$ ./wellformed sys --stats=1
Author: Eneko Agirre
Oct. 12, 2014
Updated: Inigo Lopez
July 20, 2015
changes:
Do not raise an error if chunks are made of non-consecutive tokens (To allow M:N relations)
Do not raise warnings when token ids are used in several distinct alignments (To allow M:N relations)
Format example (... used for omissions):
<sentence id ="1" status="">
// ...
// ...
<source>
...
</source>
<translation>
...
</translation>
<alignment>
7 <==> 0 // NOALI // 0 // . <==> -not aligned-
0 <==> 7 8 // NOALI // NIL // -not aligned- <==> high up
3 <==> 6 // EQUI // 5 // standing <==> is
4 5 6 <==> 9 10 11 // EQUI // 5 // on tree branches <==> on tree branches
0 <==> 12 // NOALI // 0 // -not aligned- <==> .
1 2 <==> 1 2 3 4 5 // SPE2 // 4 // A cat <==> A black and white cat
</alignment>
=cut
use Getopt::Long qw(:config auto_help);
use Pod::Usage;
use warnings;
use strict;
use List::Util qw(max) ;
use Scalar::Util qw(looks_like_number);
my $DBG = 0 ;
GetOptions("stats=i" => \$DBG)
or
pod2usage() ;
pod2usage if $#ARGV != 0 ;
my %MAINTYPES = ('EQUI'=>1,'OPPO'=>1,'SPE1'=>1,'SPE2'=>1,'SIMI'=>1,'REL'=>1,'NOALI'=>1,'ALIC'=>1) ;
my %OPTTYPES = ('FACT'=>1,'POL'=>1) ;
my $stats = {} ;
my $correct = loadalignments($ARGV[0]) ;
printf "Well-formedness of %s: %s\n", $ARGV[0], $correct ;
printstats($stats) if $DBG ;
if ($correct eq "correct") {
exit(0) ; }
else {
exit(1) ; }
# global variables for reporting error
my $id ;
my $line ;
sub loadalignments {
my ($f) = @_ ;
my $alis = {} ;
my $correct = "correct" ;
open(I,$f) or die $! ;
while (<I>) {
chomp ;
$id = $1 if /sentence id="([^\"]*)" / ;
if (/<==>/) {
$line = $_ ;
raiseerror("Can\'t find id") if not defined $id ;
$correct = "incorrect" if not defined $id ;
my ($alignment,$types,$score,$comment) = split(/\/\//,$_) ;
my ($tokens1,$tokens2) = split(/<==>/,$alignment) ;
$tokens1 =~ s/^\s+// ; $tokens1 =~ s/\s+$// ;
$tokens2 =~ s/^\s+// ; $tokens2 =~ s/\s+$// ;
$score =~ s/^\s+// ; $score =~ s/\s+$// ;
$types =~ s/^\s+// ; $types =~ s/\s+$// ;
my @tokens1 = split(/\s+/,$tokens1) ;
my @tokens2 = split(/\s+/,$tokens2) ;
my @types = split(/_/,$types) ;
if (not @tokens1) {raiseerror("wrong alignment") ; $correct = "incorrect" } ;
if (not @tokens2) {raiseerror("wrong alignment") ; $correct = "incorrect" } ;
foreach (@tokens1,@tokens2) { if (! /^\d+$/) { raiseerror("wrong token"); $correct = "incorrect" } ;} ;
if (not(looks_like_number($score)) and $score ne "NIL") { raiseerror("wrong score") ; $correct = "incorrect" } ;
if (looks_like_number($score) and ($score<0 or $score>5)) { raiseerror("wrong score") ; $correct = "incorrect" } ;
# only allow NIL for NOALI
if ($score =~ /^NIL$/) { if ($types !~ /NOALI|ALIC/) { raiseerror("wrong score") ; $correct = "incorrect" } } ;
if (sprintf("%d",scalar(@types)) !~ /^[1-3]$/) {
raiseerror("wrong number of types (one main type, plus two possible optional types)") ;
$correct = "incorrect ";
}
my $maintypeN ;
foreach (@types) {
if ((! $MAINTYPES{$_}) and (! $OPTTYPES{$_})) {raiseerror("wrong type") ; $correct = "incorrect" } ;
$maintypeN++ if $MAINTYPES{$_} ;
}
if ($maintypeN ne 1) {
raiseerror("there needs to be exactly one main type") ;
$correct = "incorrect" ;
}
# record stats only if correct
if ($correct eq "correct") {
$stats->{'aligns'}{$id}++ ;
foreach (@types) {$stats->{'types'}{$_}++} ;
}
# segment alignments include NOALI
$alis->{$id}{"segments12"}{$tokens1}{$tokens2} = [ @types ] ;
$alis->{$id}{"segments21"}{$tokens2}{$tokens1} = [ @types ] ;
# don't introduce NOALI as token alighments
next if $tokens1[0] == 0 ;
next if $tokens2[0] == 0 ;
# produce token alignments and store them both by token and as link
foreach my $t1 (@tokens1) {
foreach my $t2 (@tokens2) {
$alis->{$id}{"tokens12"}{$t1}{$t2} = [ @types ] ;
$alis->{$id}{"tokens21"}{$t2}{$t1} = [ @types ] ;
$alis->{$id}{"links12"}{"$t1 $t2"} = [ @types ] ;
$alis->{$id}{"links21"}{"$t2 $t1"} = [ @types ] ;
}
}
}
}
return $correct ;
}
sub printstats {
my ($stats) = @_ ;
my ($pairswithalignment,$alignments,%alignments) ;
printf "\n" ;
foreach (keys %{$stats->{'aligns'}}) {
$pairswithalignment++ ;
$alignments+= $stats->{'aligns'}{$_} ;
}
printf " stats: sentence pairs %d\n",$pairswithalignment ;
printf " stats: alignments %d\n",$alignments ;
foreach (sort keys %{$stats->{'types'}}) {
printf " stats: type %5s %d\n",$_,$stats->{'types'}{$_} ;
}
printf "\n" ;
}
sub consecutive {
my @idx = sort {$a<=>$b} @_ ;
my $total ;
foreach (@idx) {$total+=$_ } ;
return ($total/(-$idx[0]+$idx[$#idx]+1) == ($idx[0]+$idx[$#idx]) / 2) ;
}
sub raiseerror {
my ($message) = @_ ;
warn "$message ($id: $line)\n" ;
}