#!/usr/local/bin/perl # make sure that all output is flushed. $| = 1; $startTime = time; prepro(); print "STARTING SEARCH\n"; foreach $fileName (@preFileNames) { open(INFILE, $fileName); $files++; while ($line = ) { # look for the anchor pattern if ($line =~ /$anchorPattern/) { # check if the pattern occurs in the right track if ($pTrackLabel[0] =~ /ANY TRACK/ || $line =~ /^$pTrackLabel[0]/) { search(); # take a closer look at this file last; # and continue with the next file } } } close(INFILE); # report progress $p = $files / ($#preFileNames + 1); print "PROGRESS $p\n"; } $duration = time - $startTime; print "found $hits hits in $files files in $duration seconds\n"; print "READY\n"; # do a search in a specific file from which we know that it contains the anchor pattern sub search { # read the file into an array open(SEARCHFILE, $fileName); @lines = ; close(SEARCHFILE); $nrOfLines = $#lines + 1; for($i = 0; $i < $nrOfLines; $i++) { # look for all occurences of the anchor pattern in this line (could be more than 1) while ($lines[$i] =~ /$anchorPattern/g) { # remember information regarding the position of this hit $hitBegin = length $`; $hitEnd = $hitBegin + length $&; # ignore the block nr. info lines if they accidently contain the anchor pattern if ($lines[$i] =~ /^\d/) { next; } #check if the hit is in the right track and in the content part of the preprocessed data if ($pTrackLabel[0] =~ /ANY TRACK/ || index($lines[$i], $pTrackLabel[0]) == 0 && $hitBegin > 3) { # get the hit info for the anchor pattern $boolHit[0] = 1; $trackHit[0] = $lines[$i]; ($utteranceNrHit[0], $wordNrHit[0], $lengthHit[0]) = getPositionInfo($i, $hitBegin); # look for hits for the other patterns for ($j = 1; $j < $nrOfPatterns; $j++) { # find the first and last line of this patterns scope ($from, $to) = findRange($utteranceNrHit[0], $pScope[$j]); # get a list with all the hits for this pattern within its scope $hitsForPattern[$j] = rangeSearch($pRegExp[$j], $pTrackLabel[$j], $from, $to); } # try to find a combination of hits that evaluates to a true queryBool evaluateHits(); } # make sure the search for the anchor pattern continues at the right place pos $lines[$i] = $hitBegin + 1; } } } # tries all combinations of hits for all the patterns until a true queryBool is found sub evaluateHits { my $i; #initialize the hitIndices $hitIndex[1] = -1; for ($i = 2; $i < $nrOfPatterns; $i++) { $hitIndex[$i] = 0; } do { # calculate the next combination of hitIndices # this loop systematicaly runs through all possible combinations of hit indices for ($i = 1; $i < $nrOfPatterns; $i++) { if ($hitIndex[$i] < $#{$hitsForPattern[$i]}) { $hitIndex[$i]++; last; } else { $hitIndex[$i] = 0; } } # set all the queryBool parameters for the current combination of hits for ($i = 1; $i < $nrOfPatterns; $i++) { ($boolHit[$i], $utteranceNrHit[$i], $wordNrHit[$i], $lengthHit[$i]) = @{$hitsForPattern[$i]->[$hitIndex[$i]]}; } # if this combination gives a true queryBool then show the hit info # and continue with the next ANCHOR pattern search. if (eval($queryBool)) { $hits++; printHitInfo(); return; } } while (!triedAllCombinations()); } # check if all combinations of hits are evaluated sub triedAllCombinations { my $i; for ($i = 1; $i < $nrOfPatterns; $i++) { if ($hitIndex[$i] < $#{$hitsForPattern[$i]}) { return 0; # false } } return 1; # true } sub findRange { my ($anchorBlockNr, $scope) = @_; my $from = $scope; $from =~ s/ .*//; my $to = $scope; $to =~ s/.* //; # scope mag geen spaties aan het einde hebben, dit moet beter!!! my $firstLine; if ($from =~ /\-X/) { $firstLine = 0; } else { my $firstBlockNr = $anchorBlockNr + $from; if ($firstBlockNr < 0) { $firstBlockNr = 0; } my $i = 0; while ($lines[$i] !~ /^$firstBlockNr /) { $i++; } $firstLine = $i + 1; } my $lastLine; if ($to =~ /\+X/) { $lastLine = $#lines; } else { my $lastBlockNr = $anchorBlockNr + $to + 1; my $i = 0; while ($i <= $#lines && $lines[$i] !~ /^$lastBlockNr /) { $i++; } $lastLine = $i - 1; } return ($firstLine, $lastLine); } sub rangeSearch { my ($pattern, $trackLabel, $rangeBegin, $rangeEnd) = @_; my @hits = (); my $hitBegin; my $utteranceNr; my $utteranceLength; my $wordNr; my $i; for($i = $rangeBegin; $i <= $rangeEnd; $i++) { pos $lines[$i] = 0; while ($lines[$i] =~ /$pattern/g) { $hitBegin = length $`; if ($trackLabel =~ /ANY TRACK/ || index($lines[$i], $trackLabel) == 0) { ($utteranceNr, $wordNr, $utteranceLength) = getPositionInfo($i, $hitBegin); push @hits, [1, $utteranceNr, $wordNr, $utteranceLength]; } pos $lines[$i] = $hitBegin + 1; } } if ($#hits == -1) { push @hits, [0, "NOT FOUND\n", -1, -1]; } return \@hits; } sub getPositionInfo { my ($lineNr, $hitBegin) = @_; # find the block number my $i = $lineNr; while ($lines[$i] !~ /^\d/) { $i--; } my $blockNr = $lines[$i]; $blockNr =~ s/ .*//; chop($blockNr); # find the word number my @words = split /\s+/, $lines[$lineNr]; my $length = 0; my $wordNr = 0; while ($length <= $hitBegin) { $length += (length $words[$wordNr++]) + 1; # +1 because of the space characters } $subst = substr($lines[$lineNr], $hitBegin, 1); if (substr($lines[$lineNr], $hitBegin, 1) eq ' ') { $wordNr -= 1; } else { $wordNr -= 2; } return ($blockNr, $wordNr, $#words); } # returns the tier name and the content of the ORT track given a block number sub getBlockInfo { my ($blockNr) = @_; my $i; #BUG removed. blocks are now 8 lines # for ($i = 0; $lines[$i] !~ /^$blockNr/ && $i < $#lines; $i += 6) {} for ($i = 0; $lines[$i] !~ /^$blockNr/ && $i < $#lines; $i += 8) {} my $tierName; my $ortContent; if ($lines[$i] =~ /^$blockNr/) { $tierName = $lines[$i]; $tierName =~ s/.* //; chop($tierName); $ortContent = substr($lines[$i + 1], 4); chop($ortContent); } return ($tierName, $ortContent); } sub printHitInfo { # The following info regarding the hit is printed: # # Transcription file name + path (the .sea file is returned) # Tier name # Track type # Utterance number (= Tag number) # Content of the Ort track # word (tuple) number of the hit # hit begin position in the content of the ORT track # hit end position in the content of the ORT track print "FN: $fileName\n"; ($tierName, $ortContent) = getBlockInfo($utteranceNrHit[0]); print "TN: $tierName\n"; print "TT: $pTrackLabel[0]\n"; print "UN: $utteranceNrHit[0]\n"; print "OC: $ortContent\n"; print "WN: $wordNrHit[0]\n"; # if ANCHOR was in ORT track show detailed hit position info else show position info on the word level if ($pTrackLabel[0] eq "ORT") { # correct for the ORT label $hitBeginPos = $hitBegin - 4; $hitEndPos =$hitEnd - 4; } else { my @words = split / /, $ortContent; $hitword = $words[$wordNrHit[0]]; $pre = join ' ', @words[0..$wordNrHit[0] - 1]; $hitBeginPos = (length $pre) + 1; $hitEndPos = $hitBeginPos + length $hitword; } print "HB: $hitBeginPos\n"; print "HE: $hitEndPos\n"; } sub prepro { $corpusDir = "Corpora/cgn/annot"; @preFileNames = fileNames($corpusDir, "sea"); $qfile = $ARGV[0]; if ($qfile eq "") { # read query definition from STDIN @qfile = (); do { $input = ; push @qfile, $input; } while ($input !~ /^BOOL/) } else { # read query definition from $qfile open(QFILE, $qfile); @qfile = ; close(QFILE); } @pName = (); @pRegExp = (); @pTrackLabel = (); @pScope = (); $nrOfPatterns = 0; for ($i = 0; $i <= $#qfile; $i += 5) { if ($qfile[$i] =~ /^BOOL: /) { $qfile[$i] =~ s/\n$//; ($qBool = $qfile[$i]) =~ s/^BOOL: //; } else { $qfile[$i] =~ s/\n$//; $qfile[$i + 1] =~ s/\n$//; $qfile[$i + 2] =~ s/\n$//; $qfile[$i + 3] =~ s/\n$//; push @pName, ($qfile[$i]); push @pRegExp, ($qfile[$i + 1]); push @pTrackLabel, ($qfile[$i + 2]); push @pScope, ($qfile[$i + 3]); if ($qfile[$i + 3] =~ /ANCHOR/) { $anchorPattern = $qfile[$i + 1]; } $nrOfPatterns++; } } # transform the query boolean into a perl evaluable form $queryBool = " ".$qBool." "; for ($i = 0; $i < $nrOfPatterns; $i++) { $queryBool =~ s/(\s|\!|$)$pName[$i](\s|$)/$1\$boolHit[$i]$2/g; $queryBool =~ s/(\s|\!|\()$pName[$i]\.unr/$1\$utteranceNrHit[$i]/g; $queryBool =~ s/(\s|\!|\()$pName[$i]\.wnr/$1\$wordNrHit[$i]/g; $queryBool =~ s/(\s|\!|\()$pName[$i]\.len/$1\$lengthHit[$i]/g; } $#boolHit = $nrOfPatterns; $#trackHit = $nrOfPatterns; $#utteranceNrHit = $nrOfPatterns; $#wordNrHit = $nrOfPatterns; $#lengthHit = $nrOfPatterns; $#hitIndex = $nrOfPatterns; $hits = 0; $files = 0; } sub fileNames { my ($path, $extension) = @_; opendir(DIR, $path); my @entries = readdir(DIR); close(DIR); my @files = (); # push @files, "/data/hsm-archive/eudico/corpora/cgn/annot/r1nl_01/sea/fn000001.sea"; # return @files; my $entry; foreach $entry (@entries) { if ($entry =~ /^\./) { next; } my $newPath = $path."/".$entry; if (-d $newPath) { push @files, fileNames($newPath, $extension); } if ($entry =~ /\.$extension/) { push @files, ("$newPath"); } } return @files; }