mirror of
https://github.com/kuhyx/WUT_Computer_Science.git
synced 2026-07-04 16:23:11 +02:00
440 lines
15 KiB
Perl
Executable File
440 lines
15 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
#
|
|
# based on:
|
|
# Melamed, I. Dan. 1998.
|
|
# Manual annotation of translational equivalence: The blinker
|
|
# project. Technical Report 98-07, Institute for
|
|
# Research in Cognitive Science, Philadelphia,
|
|
#
|
|
# http://repository.upenn.edu/cgi/viewcontent.cgi?article=1055&context=ircs_reports
|
|
#
|
|
#
|
|
# given:
|
|
#
|
|
# g = gold standard token:token alignments (produced aligning all
|
|
# tokens in chunk:chunk alignments)
|
|
#
|
|
# s = system token:token alignments (produced aligning all tokens
|
|
# in chunk:chunk alignments)
|
|
#
|
|
# we discard punctuation .,:'`?;"-
|
|
#
|
|
# we define
|
|
# precis(g,s) = | overlap(g,s) | / |s| (eq. 1 in Melamed 98)
|
|
# recall(g,s) = | overlap(g,s) | / |g| (eq. 2 in Melamed 98)
|
|
# (F1 is equal to eq. 3 in Melamed 98)
|
|
#
|
|
# where overlap returns the number of token:token alignments in
|
|
# common between both sets
|
|
#
|
|
# g and s can be fuzzy sets, where each token:token alignment is
|
|
# weighted as follows:
|
|
#
|
|
# weight(t1:t2) = 1 / max(fanout(t1),fanout(t2)) (eq. 4 in Melamed 98)
|
|
#
|
|
# given directed alignments, fanout(t) is the number of token:token
|
|
# alignments which have their origin in t #
|
|
#
|
|
# As we have a different fanout factor in the gold standard pair and
|
|
# in the system pair we use the fanout of the sys to compute overlap
|
|
# and |s| for precision, and the fanout of the gold standard to
|
|
# compute overlap and |g| for recall. #
|
|
#
|
|
# Precision and recall are computed for all alignments of all pairs
|
|
# in one go (i.e. as opposed to aeraging F1 of each sentence pair)
|
|
#
|
|
# The script provides four evaluation measures:
|
|
#
|
|
# - F1 where alignment type and score are ignored (F1A)
|
|
# - F1 where alignment types need to match, but scores are
|
|
# ignored. Match is quantified using Jaccard, as there can be multiple
|
|
# tags (FACT,POL). (F1AT)
|
|
# - F1 where alignment type is ignored, but each alignment is penalized
|
|
# when scores do not match (F1AS)
|
|
# - F1 where alignment types need to match, and each alignment is
|
|
# penalized when scores do not match. Match is quantified using
|
|
# Jaccard, as there can be multiple tags (FACT,POL). In addition the
|
|
# following special cases are catered for:
|
|
# . there is no type penalty between tags {SPE1, SPE2, REL, SIMI} when
|
|
# scores are (0-2]
|
|
# . there is no type penalty between EQUI and SIMI/SPE with score 4
|
|
# (F1AST)
|
|
|
|
#
|
|
# When type needs to match, a token:token alignment is in the overlap
|
|
# iff the types of the alignment in system and gold standard files is
|
|
# the same.
|
|
#
|
|
# When the scores are taken into account, the weight of the
|
|
# token:token alignment is penalized for differenes in score between
|
|
# the system and gold-standard alignment, as follows: #
|
|
#
|
|
# weight(t1:t2) = 1 / max(fanout(t1),fanout(t2)) (eq. 4 in Melamed 98)
|
|
# * ( 1 - abs(score(t1:t2,sys) - score(t1:t2)) / 5)
|
|
#
|
|
# Changes:
|
|
# v1 Oct. 16 2014
|
|
# - first release
|
|
|
|
# v2 Nov. 7 2014
|
|
# - bug fixed: Bug affecting alignments which had multiple types, as
|
|
# equalset received tags concatenated by _
|
|
# - changes to address the following special case for F1AST:
|
|
# . there is no type penalty between tags {SPE1, SPE2, REL, SIMI}
|
|
# when both scores are (0-2]
|
|
# . there is no type penalty between EQUI and SIMI/SPE with score 4.
|
|
|
|
# v2 Sep. 7 2015 (minor changes ~ inigo lopez-gazpio)
|
|
# - Do not raise warnings when tokens are used in several distinct alignments (allow M:N alignments)
|
|
|
|
|
|
=head1 $0
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
evalF1.pl gs system --debug=[01]
|
|
|
|
Outputs the F1
|
|
|
|
Example:
|
|
|
|
$ ./evalF1.pl gs sys --debug=1
|
|
|
|
Author: Eneko Agirre
|
|
|
|
Nov. 7, 2014
|
|
|
|
|
|
=cut
|
|
|
|
|
|
use Getopt::Long qw(:config auto_help);
|
|
use Pod::Usage;
|
|
use warnings;
|
|
use strict;
|
|
use List::Util qw(max) ;
|
|
use File::Basename;
|
|
|
|
my $DBG = 0 ;
|
|
|
|
GetOptions("debug=i" => \$DBG)
|
|
or
|
|
pod2usage() ;
|
|
|
|
pod2usage if $#ARGV != 1 ;
|
|
|
|
my $dir = dirname(__FILE__);
|
|
my $incorrect = system "perl $dir/wellformed.pl $ARGV[1] > /dev/null" ;
|
|
die "\nSys file $ARGV[1] is not well-formed" if $incorrect ;
|
|
|
|
print "GS: $ARGV[0]\n" if $DBG ;
|
|
my $gs = loadalignments($ARGV[0],'gold') ;
|
|
|
|
print "SYS: $ARGV[1]\n" if $DBG ;
|
|
my $sys = loadalignments($ARGV[1],'sys') ;
|
|
|
|
printf " F1 Ali %6.4f\n", F1($gs,$sys,'') ;
|
|
printf " F1 Type %6.4f\n", F1($gs,$sys,'type') ;
|
|
printf " F1 Score %6.4f\n", F1($gs,$sys,'score') ;
|
|
printf " F1 Typ+Sco %6.4f\n", F1($gs,$sys,'typescore') ;
|
|
|
|
|
|
# the type of alignment only influences the following
|
|
# - gs: store string of each token, remove if punctuation
|
|
# - sys: use string of each token, remove if punctuation
|
|
sub loadalignments {
|
|
my ($f,$type) = @_ ;
|
|
my $alis = {} ;
|
|
my ($id) ;
|
|
my ($sent1,$sent2) ;
|
|
open(I,$f) or die $! ;
|
|
while (<I>) {
|
|
chomp ;
|
|
# extract pair id, and insert token strings in $alis if gold standard
|
|
if (/sentence id="([^\"]*)" /) {
|
|
$id = $1 ;
|
|
if ($type eq 'gold') {
|
|
$sent1 = <I> ; chomp($sent1); $sent1 =~ s/^\/\/ //; $alis->{$id}{"string1"} = [ split(/ /,$sent1) ] ;
|
|
$sent2 = <I> ; chomp($sent2); $sent2 =~ s/^\/\/ //; $alis->{$id}{"string2"} = [ split(/ /,$sent2) ] ;
|
|
}
|
|
}
|
|
|
|
# parse alignments
|
|
if (/<==>/) {
|
|
die "contact developer" if not defined $id ;
|
|
next if not defined $id ;
|
|
# parse alignment
|
|
my ($alignment,$types,$score,$comment) = split(/\/\//,$_) ;
|
|
my ($tokens1,$tokens2) = split(/<==>/,$alignment) ;
|
|
$tokens1 =~ s/^\s+// ; $tokens1 =~ s/\s+$// ;
|
|
$tokens2 =~ s/^\s+// ; $tokens2 =~ s/\s+$// ;
|
|
$score =~ s/^\s+// ; $score =~ s/\s+$// ;
|
|
$types =~ s/^\s+// ; $types =~ s/\s+$// ;
|
|
my @tokens1 = split(/\s+/,$tokens1) ;
|
|
my @tokens2 = split(/\s+/,$tokens2) ;
|
|
my @types = split(/_/,$types) ;
|
|
|
|
# store chunk alignments, including NOALI
|
|
$alis->{$id}{"segments12"}{$tokens1}{$tokens2} = [ @types ] ;
|
|
$alis->{$id}{"segments21"}{$tokens2}{$tokens1} = [ @types ] ;
|
|
|
|
# produce token:token alignments, unless NOALI or ALIC (where one chunk is null, represented by 0)
|
|
next if $tokens1[0] == 0 ;
|
|
next if $tokens2[0] == 0 ;
|
|
|
|
# remove punctuation from evaluation
|
|
my $tmp ;
|
|
if ($type eq 'gold'){ $tmp = $alis } else { $tmp = $gs } ;
|
|
@tokens1 = grep { $tmp->{$id}{"string1"}[$_-1] !~ /^[.,:\'\`?;\"-]$/} @tokens1 ;
|
|
@tokens2 = grep { $tmp->{$id}{"string2"}[$_-1] !~ /^[.,:\'\`?;\"-]$/} @tokens2 ;
|
|
next if ! @tokens1 ;
|
|
next if ! @tokens2 ;
|
|
|
|
# produce token:token alignments and index them by token and by alignmet in both directions
|
|
foreach my $t1 (@tokens1) {
|
|
foreach my $t2 (@tokens2) {
|
|
# store @type separately for all kinds of alignments
|
|
$alis->{$id}{"tokens12"}{$t1}{$t2} = [ @types ] ;
|
|
$alis->{$id}{"tokens21"}{$t2}{$t1} = [ @types ] ;
|
|
$alis->{$id}{"links12"}{"$t1 $t2"} = [ @types ] ;
|
|
$alis->{$id}{"links21"}{"$t2 $t1"} = [ @types ] ;
|
|
# store $score separately for all kinds of alignments
|
|
$alis->{$id}{"tokens12score"}{$t1}{$t2} = $score ;
|
|
$alis->{$id}{"tokens21score"}{$t2}{$t1} = $score ;
|
|
$alis->{$id}{"links12score"}{"$t1 $t2"} = $score ;
|
|
$alis->{$id}{"links21score"}{"$t2 $t1"} = $score ;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
die "No alignments found in $f, terminating" if (scalar keys %$alis) == 0 ;
|
|
print " Number of pairs $f: " . (scalar keys %$alis) . "\n" if $DBG ;
|
|
return $alis ;
|
|
}
|
|
|
|
|
|
# check set equality for types
|
|
sub equalset {
|
|
my ($set1,$set2) = @_ ;
|
|
my $hash1 = { map { ($_, 1) } @$set1 } ;
|
|
my $hash2 = { map { ($_, 1) } @$set2 } ;
|
|
my $equal = 1 ;
|
|
foreach my $el1 (keys %$hash1) {
|
|
if (! $hash2->{$el1} ) { $equal = 0 ; last ; } ;
|
|
}
|
|
foreach my $el2 (keys %$hash2) {
|
|
if (! $hash1->{$el2} ) { $equal = 0 ; last ; } ;
|
|
}
|
|
print (" type mismatch: " . join("_",@$set1) . " and " . join("_",@$set2). "\n") if $DBG and ! $equal ;
|
|
return $equal ;
|
|
}
|
|
|
|
# check jaccard between sets of types, normalizing to lowercase
|
|
sub jaccardset {
|
|
my ($set1,$set2) = @_ ;
|
|
my $hash1 = { map { (lc $_, 1) } @$set1 } ;
|
|
my $hash2 = { map { (lc $_, 1) } @$set2 } ;
|
|
my $intersect = {} ;
|
|
my $union = {} ;
|
|
foreach my $el1 (keys %$hash1) {
|
|
$union->{$el1} = 1 ;
|
|
$intersect->{$el1} = 1 if $hash2->{$el1} ;
|
|
}
|
|
foreach my $el2 (keys %$hash2) {
|
|
$union->{$el2} = 1 ;
|
|
$intersect->{$el2} = 1 if $hash1->{$el2} ;
|
|
}
|
|
my $jaccard = scalar(keys %$intersect) / scalar(keys %$union) ;
|
|
return $jaccard ;
|
|
}
|
|
|
|
|
|
|
|
# check jaccard between sets of types, normalizing to lowercase
|
|
# no penalty between tags (SPE1, SPE2, REL, SIMI) when score is (0-2] (F1AST)
|
|
# no penalty between EQUI and SIMI/SPE with score 4 (F1AST)
|
|
# add 1 to intersection, substract 1 to union
|
|
sub jaccardsetNOP {
|
|
my ($set1,$set2) = @_ ;
|
|
my $hash1 = { map { (lc $_, 1) } @$set1 } ;
|
|
my $hash2 = { map { (lc $_, 1) } @$set2 } ;
|
|
my $intersect = {} ;
|
|
my $union = {} ;
|
|
foreach my $el1 (keys %$hash1) {
|
|
$union->{$el1} = 1 ;
|
|
$intersect->{$el1} = 1 if $hash2->{$el1} ;
|
|
}
|
|
foreach my $el2 (keys %$hash2) {
|
|
$union->{$el2} = 1 ;
|
|
$intersect->{$el2} = 1 if $hash1->{$el2} ;
|
|
}
|
|
my $jaccard = (1 + scalar(keys %$intersect)) / (-1 + scalar(keys %$union)) ;
|
|
return $jaccard ;
|
|
}
|
|
|
|
|
|
# intersection between two sets
|
|
sub intersect {
|
|
my ($set1,$set2) = @_ ;
|
|
my $hash1 = { map { (lc $_, 1) } @$set1 } ;
|
|
my $hash2 = { map { (lc $_, 1) } @$set2 } ;
|
|
my $intersect = {} ;
|
|
foreach my $el1 (keys %$hash1) {
|
|
$intersect->{$el1} = 1 if $hash2->{$el1} ;
|
|
}
|
|
foreach my $el2 (keys %$hash2) {
|
|
$intersect->{$el2} = 1 if $hash1->{$el2} ;
|
|
}
|
|
return [ keys %$intersect ] ;
|
|
}
|
|
|
|
sub EQUI {
|
|
my ($typeset)=@_ ;
|
|
foreach my $type (@$typeset) {
|
|
return 1 if $type =~ /^EQUI/i ;
|
|
}
|
|
return 0 ;
|
|
}
|
|
|
|
sub SIMISPE {
|
|
my ($typeset)=@_ ;
|
|
foreach my $type (@$typeset) {
|
|
return 1 if $type =~ /^(SIMI|SPE)/i ;
|
|
}
|
|
return 0 ;
|
|
}
|
|
|
|
sub SIMISPEREL {
|
|
my ($typeset)=@_ ;
|
|
foreach my $type (@$typeset) {
|
|
return 1 if $type =~ /^(SIMI|SPE|REL)/i ;
|
|
}
|
|
return 0 ;
|
|
}
|
|
|
|
# equation 4 needs fanout, the number of token:token alignments per token
|
|
# [guard]
|
|
# [death camp guard]
|
|
#
|
|
# e.g. fanout($alis->{"1"}{"tokens12"}{"guard"} = { death => [ equi ],
|
|
# camp => [ equi ],
|
|
# guard => [ equi ]} )
|
|
# = 3 ;
|
|
# e.g. fanout($alis->{"1"}{"tokens21"}{"guard"} = { death => [ equi ]} )
|
|
# = 1 ;
|
|
|
|
# number of token-token alignments for a given token in a given alignment direction
|
|
sub fanout {
|
|
my ($ali) = @_ ;
|
|
return scalar(keys %$ali) ;
|
|
}
|
|
|
|
# summatory of fan-out factors for all token-token alignments ( eq. 4)
|
|
sub countFanOut {
|
|
my ($ali) = @_ ;
|
|
my $count = 0;
|
|
foreach my $token1 (keys %{ $ali->{"tokens12"}}) {
|
|
foreach my $token2 (keys %{ $ali->{"tokens12"}{$token1}}) {
|
|
$count+= 1/max(fanout($ali->{"tokens12"}{$token1}),fanout($ali->{"tokens21"}{$token2})) ;
|
|
}
|
|
}
|
|
return $count ;
|
|
}
|
|
|
|
# Main function
|
|
sub F1 {
|
|
my ($alisgs,$alissys,$mode) = @_ ;
|
|
my $overlapGS ;
|
|
my $overlapSYS ;
|
|
my $linkssys ;
|
|
print "\n F1 $mode ========\n" if $DBG ;
|
|
print " recall per pair (gs, sys)\n" if $DBG ;
|
|
foreach my $id (sort {$a <=> $b} keys %$alissys) {
|
|
$linkssys += countFanOut($alissys->{$id}) ;
|
|
next if not $alisgs->{$id} ;
|
|
$overlapSYS += overlap($alissys->{$id},$alisgs->{$id},$id,$mode) ;
|
|
}
|
|
my $linksgs ;
|
|
print " precision per pair (sys, gs)\n" if $DBG ;
|
|
foreach my $id (sort {$a <=> $b} keys %$alisgs) {
|
|
$linksgs += countFanOut($alisgs->{$id}) ;
|
|
next if not $alissys->{$id} ;
|
|
$overlapGS += overlap($alisgs->{$id},$alissys->{$id},$id,$mode) ;
|
|
}
|
|
my $precision = ($linkssys == 0) ? 0 : $overlapSYS / $linkssys ;
|
|
my $recall = ($linksgs == 0) ? 0 : $overlapGS / $linksgs ;
|
|
my $f1 = ($precision + $recall == 0) ? 0 : 2*$precision*$recall/($precision + $recall) ;
|
|
print " F1 overlapSYS: $overlapSYS\n" if $DBG ;
|
|
print " F1 system links: $linkssys\n" if $DBG ;
|
|
printf " F1 precision: %4.2f\n",$precision if $DBG ;
|
|
print " F1 overlapGS: $overlapGS\n" if $DBG ;
|
|
print " F1 gs links: $linksgs\n" if $DBG ;
|
|
printf " F1 recall: %4.2f\n",$recall if $DBG ;
|
|
return $f1 ;
|
|
}
|
|
|
|
# Fuzzy overlap using fanout from first alignment set
|
|
sub overlap {
|
|
my ($ali1,$ali2,$id,$mode) = @_ ;
|
|
my $overlap = 0;
|
|
foreach my $token1 (keys %{ $ali1->{"tokens12"}}) {
|
|
foreach my $token2 (keys %{ $ali1->{"tokens12"}{$token1}}) {
|
|
if ($mode eq "") {
|
|
$overlap+= 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2}))
|
|
if $ali2->{"tokens12"}{$token1}{$token2} ;
|
|
} elsif ($mode eq "type") {
|
|
if ($ali2->{"tokens12"}{$token1}{$token2}) { # and equalset($ali1->{"tokens12"}{$token1}{$token2},$ali2->{"tokens12"}{$token1}{$token2}) ;
|
|
my $jaccard = jaccardset($ali1->{"tokens12"}{$token1}{$token2},$ali2->{"tokens12"}{$token1}{$token2}) ;
|
|
print (" type mismatch: $jaccard " . join("_",@{$ali1->{"tokens12"}{$token1}{$token2}}) . " and " . join("_",@{$ali2->{"tokens12"}{$token1}{$token2}}). "\n") if $DBG and $jaccard != 1;
|
|
$overlap+= 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2}))
|
|
* $jaccard ;
|
|
}
|
|
} elsif ($mode eq "score") {
|
|
$overlap+= 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2}))
|
|
* (1 - abs($ali1->{"tokens12score"}{$token1}{$token2} - $ali2->{"tokens12score"}{$token1}{$token2}) / 5)
|
|
if $ali2->{"tokens12"}{$token1}{$token2} ;
|
|
} elsif ($mode eq "typescore") {
|
|
next if not $ali2->{"tokens12"}{$token1}{$token2} ;
|
|
my $typeset1 = $ali1->{"tokens12"}{$token1}{$token2} ;
|
|
my $typeset2 = $ali2->{"tokens12"}{$token1}{$token2} ;
|
|
my $score1 = $ali1->{"tokens12score"}{$token1}{$token2};
|
|
my $score2 = $ali2->{"tokens12score"}{$token1}{$token2};
|
|
my $overlapincrease = 1/max(fanout($ali1->{"tokens12"}{$token1}),fanout($ali1->{"tokens21"}{$token2}))
|
|
* (1 - abs($score1 - $score2) / 5) ;
|
|
my $jaccard ;
|
|
if ((EQUI($typeset1) and SIMISPE($typeset2) and ($score2>=4))
|
|
or (EQUI($typeset2) and SIMISPE($typeset1) and ($score1>=4))) {
|
|
# no type penalty between EQUI and SIMI/SPE with score 4 (F1AST)
|
|
$jaccard = jaccardset($typeset1,$typeset2) ; # jaccardsetNOP($typeset1,$typeset2) ;
|
|
}
|
|
elsif (SIMISPEREL($typeset1) and SIMISPEREL($typeset2) and not(SIMISPEREL(intersect($typeset1,$typeset2)))
|
|
and ($score1 < 3) and ($score2 < 3)) {
|
|
# no type penalty between tags (SPE1, SPE2, REL, SIMI) when score is (0-2] (F1AST)
|
|
# and ali1 and ali2 have different type.
|
|
$jaccard = jaccardset($typeset1,$typeset2) ; # jaccardsetNOP($typeset1,$typeset2) ;
|
|
}
|
|
else { # standard match
|
|
$jaccard = jaccardset($typeset1,$typeset2) ;
|
|
# and equalset($ali1->{"tokens12"}{$token1}{$token2},$ali2->{"tokens12"}{$token1}{$token2}) ;
|
|
}
|
|
print (" type mismatch: $jaccard " . join("_",@{$typeset1}) . " ($score1) and " . join("_",@{$typeset2}). " ($score2)\n") if $DBG and $jaccard != 1;
|
|
$overlap+= $overlapincrease*$jaccard ;
|
|
} else { die }
|
|
}
|
|
}
|
|
if ($DBG and $id) {
|
|
my ($total,$precORrecall) ;
|
|
printf " " ;
|
|
printf "%2d: ",$id ;
|
|
$total = countFanOut($ali1) ;
|
|
if ($total) {
|
|
$precORrecall = $overlap / $total ;
|
|
} else {
|
|
$precORrecall = 0 ;
|
|
}
|
|
printf "ov:%5.2f tot:%5.2f p/r:%4.2f\n",$overlap,$total,$precORrecall ;
|
|
}
|
|
return $overlap ;
|
|
}
|
|
|