WUT_Computer_Science/Programming/NLP/test_goldStandard/evalF1.pl

440 lines
15 KiB
Perl
Raw Normal View History

2024-01-06 14:02:40 +01:00
#!/usr/bin/perl
#
# based on:
# Melamed, I. Dan. 1998.
# Manual annotation of translational equivalence: The blinker
# project. Technical Report 98-07, Institute for
# Research in Cognitive Science, Philadelphia,
#
# http://repository.upenn.edu/cgi/viewcontent.cgi?article=1055&context=ircs_reports
#
#
# given:
#
# g = gold standard token:token alignments (produced aligning all
# tokens in chunk:chunk alignments)
#
# s = system token:token alignments (produced aligning all tokens
# in chunk:chunk alignments)
#
# we discard punctuation .,:'`?;"-
#
# we define
# precis(g,s) = | overlap(g,s) | / |s| (eq. 1 in Melamed 98)
# recall(g,s) = | overlap(g,s) | / |g| (eq. 2 in Melamed 98)
# (F1 is equal to eq. 3 in Melamed 98)
#
# where overlap returns the number of token:token alignments in
# common between both sets
#
# g and s can be fuzzy sets, where each token:token alignment is
# weighted as follows:
#
# weight(t1:t2) = 1 / max(fanout(t1),fanout(t2)) (eq. 4 in Melamed 98)
#
# given directed alignments, fanout(t) is the number of token:token
# alignments which have their origin in t #
#
# As we have a different fanout factor in the gold standard pair and
# in the system pair we use the fanout of the sys to compute overlap
# and |s| for precision, and the fanout of the gold standard to
# compute overlap and |g| for recall. #
#
# Precision and recall are computed for all alignments of all pairs
# in one go (i.e. as opposed to aeraging F1 of each sentence pair)
#
# The script provides four evaluation measures:
#
# - F1 where alignment type and score are ignored (F1A)
# - F1 where alignment types need to match, but scores are
# ignored. Match is quantified using Jaccard, as there can be multiple
# tags (FACT,POL). (F1AT)
# - F1 where alignment type is ignored, but each alignment is penalized
# when scores do not match (F1AS)
# - F1 where alignment types need to match, and each alignment is
# penalized when scores do not match. Match is quantified using
# Jaccard, as there can be multiple tags (FACT,POL). In addition the
# following special cases are catered for:
# . there is no type penalty between tags {SPE1, SPE2, REL, SIMI} when
# scores are (0-2]
# . there is no type penalty between EQUI and SIMI/SPE with score 4
# (F1AST)
#
# When type needs to match, a token:token alignment is in the overlap
# iff the types of the alignment in system and gold standard files is
# the same.
#
# When the scores are taken into account, the weight of the
# token:token alignment is penalized for differenes in score between
# the system and gold-standard alignment, as follows: #
#
# weight(t1:t2) = 1 / max(fanout(t1),fanout(t2)) (eq. 4 in Melamed 98)
# * ( 1 - abs(score(t1:t2,sys) - score(t1:t2)) / 5)
#
# Changes:
# v1 Oct. 16 2014
# - first release
# v2 Nov. 7 2014
# - bug fixed: Bug affecting alignments which had multiple types, as
# equalset received tags concatenated by _
# - changes to address the following special case for F1AST:
# . there is no type penalty between tags {SPE1, SPE2, REL, SIMI}
# when both scores are (0-2]
# . there is no type penalty between EQUI and SIMI/SPE with score 4.
# v2 Sep. 7 2015 (minor changes ~ inigo lopez-gazpio)
# - Do not raise warnings when tokens are used in several distinct alignments (allow M:N alignments)
=head1 $0
=head1 SYNOPSIS
evalF1.pl gs system --debug=[01]
Outputs the F1
Example:
$ ./evalF1.pl gs sys --debug=1
Author: Eneko Agirre
Nov. 7, 2014
=cut
use Getopt::Long qw(:config auto_help);
use Pod::Usage;
use warnings;
use strict;
use List::Util qw(max) ;
use File::Basename;
my $DBG = 0 ;
GetOptions("debug=i" => \$DBG)
or
pod2usage() ;
pod2usage if $#ARGV != 1 ;
my $dir = dirname(__FILE__);
my $incorrect = system "perl $dir/wellformed.pl $ARGV[1] > /dev/null" ;
die "\nSys file $ARGV[1] is not well-formed" if $incorrect ;
print "GS: $ARGV[0]\n" if $DBG ;
my $gs = loadalignments($ARGV[0],'gold') ;
print "SYS: $ARGV[1]\n" if $DBG ;
my $sys = loadalignments($ARGV[1],'sys') ;
printf " F1 Ali %6.4f\n", F1($gs,$sys,'') ;
printf " F1 Type %6.4f\n", F1($gs,$sys,'type') ;
printf " F1 Score %6.4f\n", F1($gs,$sys,'score') ;
printf " F1 Typ+Sco %6.4f\n", F1($gs,$sys,'typescore') ;
# the type of alignment only influences the following
# - gs: store string of each token, remove if punctuation
# - sys: use string of each token, remove if punctuation
sub loadalignments {
my ($f,$type) = @_ ;
my $alis = {} ;
my ($id) ;
my ($sent1,$sent2) ;
open(I,$f) or die $! ;
while (<I>) {
chomp ;
# extract pair id, and insert token strings in $alis if gold standard
if (/sentence id="([^\"]*)" /) {
$id = $1 ;
if ($type eq 'gold') {
$sent1 = <I> ; chomp($sent1); $sent1 =~ s/^\/\/ //; $alis->{$id}{"string1"} = [ split(/ /,$sent1) ] ;
$sent2 = <I> ; chomp($sent2); $sent2 =~ s/^\/\/ //; $alis->{$id}{"string2"} = [ split(/ /,$sent2) ] ;
}
}
# parse alignments
if (/<==>/) {
die "contact developer" if not defined $id ;
next if not defined $id ;
# parse alignment
my ($alignment,$types,$score,$comment) = split(/\/\//,$_) ;
my ($tokens1,$tokens2) = split(/<==>/,$alignment) ;
$tokens1 =~ s/^\s+// ; $tokens1 =~ s/\s+$// ;
$tokens2 =~ s/^\s+// ; $tokens2 =~ s/\s+$// ;
$score =~ s/^\s+// ; $score =~ s/\s+$// ;
$types =~ s/^\s+// ; $types =~ s/\s+$// ;
my @tokens1 = split(/\s+/,$tokens1) ;
my @tokens2 = split(/\s+/,$tokens2) ;
my @types = split(/_/,$types) ;
# store chunk alignments, including NOALI
$alis->{$id}{"segments12"}{$tokens1}{$tokens2} = [ @types ] ;
$alis->{$id}{"segments21"}{$tokens2}{$tokens1} = [ @types ] ;
# produce token:token alignments, unless NOALI or ALIC (where one chunk is null, represented by 0)
next if $tokens1[0] == 0 ;
next if $tokens2[0] == 0 ;
# remove punctuation from evaluation
my $tmp ;
if ($type eq 'gold'){ $tmp = $alis } else { $tmp = $gs } ;
@tokens1 = grep { $tmp->{$id}{"string1"}[$_-1] !~ /^[.,:\'\`?;\"-]$/} @tokens1 ;
@tokens2 = grep { $tmp->{$id}{"string2"}[$_-1] !~ /^[.,:\'\`?;\"-]$/} @tokens2 ;
next if ! @tokens1 ;
next if ! @tokens2 ;
# produce token:token alignments and index them by token and by alignmet in both directions
foreach my $t1 (@tokens1) {
foreach my $t2 (@tokens2) {
# store @type separately for all kinds of alignments
$alis->{$id}{"tokens12"}{$t1}{$t2} = [ @types ] ;
$alis->{$id}{"tokens21"}{$t2}{$t1} = [ @types ] ;
$alis->{$id}{"links12"}{"$t1 $t2"} = [ @types ] ;
$alis->{$id}{"links21"}{"$t2 $t1"} = [ @types ] ;
# store $score separately for all kinds of alignments
$alis->{$id}{"tokens12score"}{$t1}{$t2} = $score ;
$alis->{$id}{"tokens21score"}{$t2}{$t1} = $score ;
$alis->{$id}{"links12score"}{"$t1 $t2"} = $score ;
$alis->{$id}{"links21score"}{"$t2 $t1"} = $score ;
}
}
}
}
die "No alignments found in $f, terminating" if (scalar keys %$alis) == 0 ;
print " Number of pairs $f: " . (scalar keys %$alis) . "\n" if $DBG ;
return $alis ;
}
# check set equality for types
sub equalset {
my ($set1,$set2) = @_ ;
my $hash1 = { map { ($_, 1) } @$set1 } ;
my $hash2 = { map { ($_, 1) } @$set2 } ;
my $equal = 1 ;
foreach my $el1 (keys %$hash1) {
if (! $hash2->{$el1} ) { $equal = 0 ; last ; } ;
}
foreach my $el2 (keys %$hash2) {
if (! $hash1->{$el2} ) { $equal = 0 ; last ; } ;
}
print (" type mismatch: " . join("_",@$set1) . " and " . join("_",@$set2). "\n") if $DBG and ! $equal ;
return $equal ;
}
# check jaccard between sets of types, normalizing to lowercase
sub jaccardset {
my ($set1,$set2) = @_ ;
my $hash1 = { map { (lc $_, 1) } @$set1 } ;
my $hash2 = { map { (lc $_, 1) } @$set2 } ;
my $intersect = {} ;
my $union = {} ;
foreach my $el1 (keys %$hash1) {
$union->{$el1} = 1 ;
$intersect->{$el1} = 1 if $hash2->{$el1} ;
}
foreach my $el2 (keys %$hash2) {
$union->{$el2} = 1 ;
$intersect->{$el2} = 1 if $hash1->{$el2} ;
}
my $jaccard = scalar(keys %$intersect) / scalar(keys %$union) ;
return $jaccard ;
}
# check jaccard between sets of types, normalizing to lowercase
# no penalty between tags (SPE1, SPE2, REL, SIMI) when score is (0-2] (F1AST)
# no penalty between EQUI and SIMI/SPE with score 4 (F1AST)
# add 1 to intersection, substract 1 to union
sub jaccardsetNOP {
my ($set1,$set2) = @_ ;
my $hash1 = { map { (lc $_, 1) } @$set1 } ;
my $hash2 = { map { (lc $_, 1) } @$set2 } ;
my $intersect = {} ;
my $union = {} ;
foreach my $el1 (keys %$hash1) {
$union->{$el1} = 1 ;
$intersect->{$el1} = 1 if $hash2->{$el1} ;
}
foreach my $el2 (keys %$hash2) {
$union->{$el2} = 1 ;
$intersect->{$el2} = 1 if $hash1->{$el2} ;
}
my $jaccard = (1 + scalar(keys %$intersect)) / (-1 + scalar(keys %$union)) ;
return $jaccard ;
}
# intersection between two sets
sub intersect {
my ($set1,$set2) = @_ ;
my $hash1 = { map { (lc $_, 1) } @$set1 } ;
my $hash2 = { map { (lc $_, 1) } @$set2 } ;
my $intersect = {} ;
foreach my $el1 (keys %$hash1) {
$intersect->{$el1} = 1 if $hash2->{$el1} ;
}
foreach my $el2 (keys %$hash2) {
$intersect->{$el2} = 1 if $hash1->{$el2} ;
}
return [ keys %$intersect ] ;
}
sub EQUI {
my ($typeset)=@_ ;
foreach my $type (@$typeset) {
return 1 if $type =~ /^EQUI/i ;
}
return 0 ;
}
sub SIMISPE {
my ($typeset)=@_ ;
foreach my $type (@$typeset) {
return 1 if $type =~ /^(SIMI|SPE)/i ;
}
return 0 ;
}
sub SIMISPEREL {
my ($typeset)=@_ ;
foreach my $type (@$typeset) {
return 1 if $type =~ /^(SIMI|SPE|REL)/i ;
}
return 0 ;
}
# equation 4 needs fanout, the number of token:token alignments per token
# [guard]
# [death camp guard]
#
# e.g. fanout($alis->{"1"}{"tokens12"}{"guard"} = { death => [ equi ],
# camp => [ equi ],
# guard => [ equi ]} )
# = 3 ;
# e.g. fanout($alis->{"1"}{"tokens21"}{"guard"} = { death => [ equi ]} )
# = 1 ;
# number of token-token alignments for a given token in a given alignment direction
sub fanout {
my ($ali) = @_ ;
return scalar(keys %$ali) ;
}
# summatory of fan-out factors for all token-token alignments ( eq. 4)
sub countFanOut {
my ($ali) = @_ ;
my $count = 0;
foreach my $token1 (keys %{ $ali->{"tokens12"}}) {
foreach my $token2 (keys %{ $ali->{"tokens12"}{$token1}}) {
$count+= 1/max(fanout($ali->{"tokens12"}{$token1}),fanout($ali->{"tokens21"}{$token2})) ;
}
}
return $count ;
}
# Main function
sub F1 {
my ($alisgs,$alissys,$mode) = @_ ;
my $overlapGS ;
my $overlapSYS ;
my $linkssys ;
print "\n F1 $mode ========\n" if $DBG ;
print " recall per pair (gs, sys)\n" if $DBG ;
foreach my $id (sort {$a <=> $b} keys %$alissys) {
$linkssys += countFanOut($alissys->{$id}) ;
next if not $alisgs->{$id} ;
$overlapSYS += overlap($alissys->{$id},$alisgs->{$id},$id,$mode) ;
}
my $linksgs ;
print " precision per pair (sys, gs)\n" if $DBG ;
foreach my $id (sort {$a <=> $b} keys %$alisgs) {
$linksgs += countFanOut($alisgs->{$id}) ;
next if not $alissys->{$id} ;
$overlapGS += overlap($alisgs->{$id},$alissys->{$id},$id,$mode) ;
}
my $precision = ($linkssys == 0) ? 0 : $overlapSYS / $linkssys ;
my $recall = ($linksgs == 0) ? 0 : $overlapGS / $linksgs ;
my $f1 = ($precision + $recall == 0) ? 0 : 2*$precision*$recall/($precision + $recall) ;
print " F1 overlapSYS: $overlapSYS\n" if $DBG ;
print " F1 system links: $linkssys\n" if $DBG ;
printf " F1 precision: %4.2f\n",$precision if $DBG ;
print " F1 overlapGS: $overlapGS\n" if $DBG ;
print " F1 gs links: $linksgs\n" if $DBG ;
printf " F1 recall: %4.2f\n",$recall if $DBG ;
return $f1 ;
}
# Fuzzy overlap using fanout from first alignment set
sub overlap {
my ($ali1,$ali2,$id,$mode) = @_ ;
my $overlap = 0;
foreach my $token1 (keys %{ $ali1->{"tokens12"}}) {
foreach my $token2 (keys %{ $ali1->{"tokens12"}{$token1}}) {
if ($mode eq "") {
$overlap+= 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2}))
if $ali2->{"tokens12"}{$token1}{$token2} ;
} elsif ($mode eq "type") {
if ($ali2->{"tokens12"}{$token1}{$token2}) { # and equalset($ali1->{"tokens12"}{$token1}{$token2},$ali2->{"tokens12"}{$token1}{$token2}) ;
my $jaccard = jaccardset($ali1->{"tokens12"}{$token1}{$token2},$ali2->{"tokens12"}{$token1}{$token2}) ;
print (" type mismatch: $jaccard " . join("_",@{$ali1->{"tokens12"}{$token1}{$token2}}) . " and " . join("_",@{$ali2->{"tokens12"}{$token1}{$token2}}). "\n") if $DBG and $jaccard != 1;
$overlap+= 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2}))
* $jaccard ;
}
} elsif ($mode eq "score") {
$overlap+= 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2}))
* (1 - abs($ali1->{"tokens12score"}{$token1}{$token2} - $ali2->{"tokens12score"}{$token1}{$token2}) / 5)
if $ali2->{"tokens12"}{$token1}{$token2} ;
} elsif ($mode eq "typescore") {
next if not $ali2->{"tokens12"}{$token1}{$token2} ;
my $typeset1 = $ali1->{"tokens12"}{$token1}{$token2} ;
my $typeset2 = $ali2->{"tokens12"}{$token1}{$token2} ;
my $score1 = $ali1->{"tokens12score"}{$token1}{$token2};
my $score2 = $ali2->{"tokens12score"}{$token1}{$token2};
my $overlapincrease = 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2}))
* (1 - abs($score1 - $score2) / 5) ;
my $jaccard ;
if ((EQUI($typeset1) and SIMISPE($typeset2) and ($score2>=4))
or (EQUI($typeset2) and SIMISPE($typeset1) and ($score1>=4))) {
# no type penalty between EQUI and SIMI/SPE with score 4 (F1AST)
$jaccard = jaccardset($typeset1,$typeset2) ; # jaccardsetNOP($typeset1,$typeset2) ;
}
elsif (SIMISPEREL($typeset1) and SIMISPEREL($typeset2) and not(SIMISPEREL(intersect($typeset1,$typeset2)))
and ($score1 < 3) and ($score2 < 3)) {
# no type penalty between tags (SPE1, SPE2, REL, SIMI) when score is (0-2] (F1AST)
# and ali1 and ali2 have different type.
$jaccard = jaccardset($typeset1,$typeset2) ; # jaccardsetNOP($typeset1,$typeset2) ;
}
else { # standard match
$jaccard = jaccardset($typeset1,$typeset2) ;
# and equalset($ali1->{"tokens12"}{$token1}{$token2},$ali2->{"tokens12"}{$token1}{$token2}) ;
}
print (" type mismatch: $jaccard " . join("_",@{$typeset1}) . " ($score1) and " . join("_",@{$typeset2}). " ($score2)\n") if $DBG and $jaccard != 1;
$overlap+= $overlapincrease*$jaccard ;
} else { die }
}
}
if ($DBG and $id) {
my ($total,$precORrecall) ;
printf " " ;
printf "%2d: ",$id ;
$total = countFanOut($ali1) ;
if ($total) {
$precORrecall = $overlap / $total ;
} else {
$precORrecall = 0 ;
}
printf "ov:%5.2f tot:%5.2f p/r:%4.2f\n",$overlap,$total,$precORrecall ;
}
return $overlap ;
}