#!/usr/bin/perl -w # version 5.0.3 last update on 29/11/2002 by # 17/09/2002 # added functionality to match only on tiers associated with a specific speaker # in that case the speaker codes must follow every search file spec seperated by a ';' # /data/corpora/CGN/ANNOTATIONS/Corpora/cgn/annot/r4nl_08/sea/fn000892.sea;N00011;N00012 # 29/11/2002 # function "findFiles" is out-sourced to external file # if name of Condition is followed by String "noregex", word boundaries are added to pattern # make sure that all output is flushed. use findFiles; $| = 1; $startTime = time; my $usage=' Usage: cgn-search.pl [queryfile] [arguments] -q quiet mode. No information of search status -s subset mode. File names are expected to follow query-information; reads from if specified, otherwise from STDIN In the latter case last file name must be followed by a line beginning with the word READY to terminate input -cp expects the CGN-Files in , if not set, built-in default is used '; @trackLabels=qw(NUM ORT POS LEM MAR PHO BEG END NIL); $nrOfTracks=$#trackLabels+1; $splitChars = "[-\\s.,:;_'`?!\\\[\\\]]"; prepro(); $quiet || print "STARTING SEARCH\n"; foreach $fileName (@preFileNames) { # print $fileName."\n"; search(); $files++; # report progress $quiet || print "PROGRESS ".$files/($#preFileNames + 1)."\n"; } $duration = time - $startTime; print "found $hits hits in $files files in $duration seconds\n"; $quiet || print "READY\n"; # do a search in a specific file sub search { # read the file into an array return unless -r $fileName; open(SEARCHFILE, $fileName); @lines = ; close(SEARCHFILE); $lastLineOfFile = $#lines; my $step = $pTrackNr[0]==0 ? 1 : $nrOfTracks; # search any track resp. only one if($subset){ $speakers_ref = $speakerlist[$files];} for($i = $pTrackNr[0]; $i <= $lastLineOfFile; $i+=$step){ if($subset){ if(@$speakers_ref > 0){ my $cc=1; $speaker = substr($lines[$i-$pTrackNr[0]],rindex($lines[$i-$pTrackNr[0]]," ")+1,-1); foreach $s (@$speakers_ref) { if($s eq $speaker){ $cc=0; last; }} if($cc){ next; } } } # look for all occurences of the anchor pattern in this line (could be more than 1) if($caseSensitive[0]){ while ($lines[$i] =~ /$anchorPattern/go){ # remember information regarding the position of this hit $hitBegin = length $`; if($noregex[0]){ $hitBegin++; $hitEnd = $hitBegin + $patternLength[0]; } else{ $hitEnd = $hitBegin + length $&; } #check if the hit is in the content part of the preprocessed data if ($hitBegin > 3) { # get the hit info for the anchor pattern $boolHit[0] = 1; ($utteranceNrHit[0], $wordNrHit[0]) = getPositionInfo($i, $hitBegin); # look for hits for the other patterns for ($j = 1; $j < $nrOfPatterns; $j++) { # get a list with all the hits for this pattern within its scope $hitsForPattern[$j] = rangeSearch($j, findRange($utteranceNrHit[0], $pScope[$j])); } # try to find a combination of hits that evaluates to a true queryBool evaluateHits(); } pos $lines[$i] = $hitBegin + 1; } } else{ while ($lines[$i] =~ /$anchorPattern/goi){ # remember information regarding the position of this hit $hitBegin = length $`; if($noregex[0]){ $hitBegin++; $hitEnd = $hitBegin + $patternLength[0]; } else{ $hitEnd = $hitBegin + length $&; } #check if the hit is in the content part of the preprocessed data if ($hitBegin > 3) { # get the hit info for the anchor pattern $boolHit[0] = 1; ($utteranceNrHit[0], $wordNrHit[0]) = getPositionInfo($i, $hitBegin); # look for hits for the other patterns for ($j = 1; $j < $nrOfPatterns; $j++) { # get a list with all the hits for this pattern within its scope $hitsForPattern[$j] = rangeSearch($j, findRange($utteranceNrHit[0], $pScope[$j])); } # try to find a combination of hits that evaluates to a true queryBool evaluateHits(); } pos $lines[$i] = $hitBegin + 1; } } } } # tries all combinations of hits for all the patterns until a true queryBool is found sub evaluateHits { my $i; #initialize the hitIndices $hitIndex[1] = -1; for ($i = 2; $i < $nrOfPatterns; $i++) { $hitIndex[$i] = 0; } do { # calculate the next combination of hitIndices # this loop systematicaly runs through all possible combinations of hit indices for ($i = 1; $i < $nrOfPatterns; $i++) { if ($hitIndex[$i] < $#{$hitsForPattern[$i]}) { $hitIndex[$i]++; last; } else { $hitIndex[$i] = 0; } } # set all the queryBool parameters for the current combination of hits for ($i = 1; $i < $nrOfPatterns; $i++) { ($boolHit[$i], $utteranceNrHit[$i], $wordNrHit[$i]) = @{$hitsForPattern[$i]->[$hitIndex[$i]]}; } # if this combination gives a true queryBool then show the hit info # and continue with the next ANCHOR pattern search. if (eval($queryBool)) { $hits++; printHitInfo(); return; } } while (!triedAllCombinations()); } # check if all combinations of hits are evaluated sub triedAllCombinations { my $i; for ($i = 1; $i < $nrOfPatterns; $i++) { if ($hitIndex[$i] < $#{$hitsForPattern[$i]}) { return 0; # false } } return 1; # true } sub findRange { my ($anchorBlockNr, $scope) = @_; my ($from, $to) = @$scope; my $firstLine; if ($from =~ /\-X/) { $firstLine = 0; } else { my $firstBlockNr = $anchorBlockNr + $from; $firstLine = $firstBlockNr*$nrOfTracks; if($firstLine < 0) { $firstLine = 0; } } my $lastLine; if ($to =~ /\+X/) { $lastLine = $lastLineOfFile; } else { my $lastBlockNr = $anchorBlockNr + $to + 1; $lastLine = $lastBlockNr*$nrOfTracks; if($lastLine > $lastLineOfFile) { $lastLine = $lastLineOfFile; } } return ($firstLine, $lastLine); } sub rangeSearch { my ($nrOfPattern, $rangeBegin, $rangeEnd) = @_; my @hits = (); my $hitBegin; my $utteranceNr; my $lengthHit; my $wordNr; my $pattern=$pRegExp[$nrOfPattern]; my $step = ($pTrackNr[$nrOfPattern]==0) ? 1 : $nrOfTracks; my $i; for($i = $rangeBegin+$pTrackNr[$nrOfPattern]; $i <= $rangeEnd; $i+=$step) { pos $lines[$i] = 0; if($caseSensitive[$nrOfPattern]){ while ($lines[$i] =~ /$pattern/g) { $hitBegin = length $`; if($noregex[$nrOfPattern]){ $hitBegin++; pos $lines[$i] = $hitBegin + $patternLength[$nrOfPattern]; } else{ pos $lines[$i] = $hitBegin + 1; } push @hits, [1, getPositionInfo($i, $hitBegin)]; } } else{ while ($lines[$i] =~ /$pattern/gi) { $hitBegin = length $`; if($noregex[$nrOfPattern]){ $hitBegin++; pos $lines[$i] = $hitBegin + $patternLength[$nrOfPattern]; } else{ pos $lines[$i] = $hitBegin + 1; } push @hits, [1, getPositionInfo($i, $hitBegin)]; } } } if ($#hits == -1) { push @hits, [0, "NOT FOUND\n", -1, -1]; } return \@hits; } sub getPositionInfo { my ($lineNr, $hitBegin) = @_; # find the block number my $blockNr=int($lineNr/$nrOfTracks); # find the word number my @wordsBeforeHit = split /\s+/, substr($lines[$lineNr],0,$hitBegin); my $wordNr = $#wordsBeforeHit; if (substr($lines[$lineNr], $hitBegin-1, 1) ne ' ') { $wordNr--; } return ($blockNr, $wordNr); } # returns the tier name and the content of the ORT track given a block number sub getBlockInfo { my ($blockNr) = @_; my $i = $blockNr*$nrOfTracks; my $tierName = substr($lines[$i],rindex($lines[$i]," ")+1); my $ortContent= substr($lines[$i + 1], 4); chop($tierName); chop($ortContent); return ($tierName, $ortContent); } sub printHitInfo { # The following info regarding the hit is printed: # # Transcription file name + path (the .sea file is returned) # Tier name # Track type # Utterance number (= Tag number) # Content of the Ort track # word (tuple) number of the hit # hit begin position in the content of the ORT track # hit end position in the content of the ORT track print "FN: $fileName\n"; ($tierName, $ortContent) = getBlockInfo($utteranceNrHit[0]); print "TN: $tierName\n"; print "TT: $trackLabels[$pTrackNr[0]]\n"; print "UN: $utteranceNrHit[0]\n"; print "OC: $ortContent\n"; print "WN: $wordNrHit[0]\n"; # if ANCHOR was in ORT track show detailed hit position info # else show position info on the word level if ($anchorIsORT) { # correct for the ORT label $hitBeginPos = $hitBegin - 4; $hitEndPos =$hitEnd - 4; } else { my @words = split / /, $ortContent; $hitword = $words[$wordNrHit[0]]; if($wordNrHit[0]==0){ $hitBeginPos=0; } else{ $pre = join ' ', @words[0..$wordNrHit[0] - 1]; $hitBeginPos = (length $pre) + 1; } $hitword=~s/\W+$//; # strip off interpunctation $hitEndPos = $hitBeginPos + length $hitword; } print "HB: $hitBeginPos\n"; print "HE: $hitEndPos\n"; my @begintimes = split / /, $lines[$utteranceNrHit[0]*$nrOfTracks+getTrackNr("BEG")]; my @endtimes = split / /, $lines[$utteranceNrHit[0]*$nrOfTracks+getTrackNr("END")]; print "BT: $begintimes[$wordNrHit[0]+1]\n"; print "ET: $endtimes[$wordNrHit[0]+1]\n"; } sub prepro { $corpusPath = "/data/corpora/CGN/ANNOTATIONS/Corpora/cgn/annot"; #looking for arguments for($i=0;$i<=$#ARGV;$i++){ if($ARGV[$i] !~ /^\-/) { $qfile=$ARGV[$i]; next; } if($ARGV[$i] eq "-q") { $quiet=1; next; } if($ARGV[$i] eq "-s") { $subset=1; next; } if($ARGV[$i] eq "-cp") { $corpusPath=$ARGV[++$i]; next; } die $usage; } if($subset) { $endkey="READY"; } else { $endkey="BOOL"; } if(defined($qfile)){ # read query definition from $qfile -r $qfile || die "Can't read query-file $qfile! \n"; open(QFILE, $qfile); @qfile = ; close(QFILE); } else{ # read query definition from STDIN @qfile = (); do { $input = ; push @qfile, $input; } while ($input !~ /^$endkey/) } @pName = (); @pRegExp = (); @pTrackNr = (); @pScope = (); $i=0; while($qfile[$i] !~ /^BOOL: /) { $qfile[$i] =~ s/\n$//; $qfile[$i + 1] =~ s/\n$//; $qfile[$i + 2] =~ s/\n$//; $qfile[$i + 3] =~ s/\n$//; if($qfile[$i] =~ s/\s+nocase$//){ $caseSensitive[$i/5] = 0; } else{ $caseSensitive[$i/5] = 1; } if($qfile[$i] =~ s/\s+noregex$//){ $noregex[$i/5] = 1; $patternLength[$i/5] = length $qfile[$i+1]; $qfile[$i+1]=~s/$/\\\(/g; $qfile[$i+1]=~s/$/\\\)/g; $qfile[$i+1] = $splitChars.$qfile[$i+1].$splitChars; } else{ $noregex[$i/5] = 0; } push @pName, ($qfile[$i]); push @pRegExp, ($qfile[$i + 1]); push @pTrackNr, (getTrackNr($qfile[$i + 2])); if ($qfile[$i + 3] =~ /ANCHOR/) { push @pScope, ([""]); $anchorPattern = $qfile[$i + 1]; } else{ my @qrange = split / /,$qfile[$i+3]; if($qrange[-1]=~/[aswu]/){ pop @qrange; if($& ne "u"){ $qrange[-2]="0"; $qrange[-1]="0"; } } $refQuery=$qfile[0]; if($qrange[0]=~/^(.*):/) { $refQuery=$1; shift @qrange; for($k = 1; $k < $#pName; $k++){ if($1 eq $pName[$k]){ $qrange[0]+=$pScope[$k][0]; $qrange[1]+=$pScope[$k][1]; } } } push @pScope, (\@qrange); } $i+=5; } $nrOfPatterns=$i/5; $qfile[$i] =~ s/\n$//; ($qBool = $qfile[$i]) =~ s/^BOOL: //; # transform the query boolean into a perl evaluable form $queryBool = " ".$qBool." "; for ($j = 0; $j < $nrOfPatterns; $j++) { $queryBool =~ s/(\s|\!|$)$pName[$j](\s|$)/$1\$boolHit[$j]$2/g; $queryBool =~ s/(\s|\!|\()$pName[$j]\.unr/$1\$utteranceNrHit[$j]/g; $queryBool =~ s/(\s|\!|\()$pName[$j]\.wnr/$1\$wordNrHit[$j]/g; } $#boolHit = $nrOfPatterns; $#utteranceNrHit = $nrOfPatterns; $#wordNrHit = $nrOfPatterns; $#hitIndex = $nrOfPatterns; $hits = 0; $files = 0; $anchorIsORT=($pTrackNr[0]==getTrackNr("ORT")); @preFileNames = (); @speakerlist = (); if($subset){ # continue reading qfile until EOF or key-word "READY" while($i<$#qfile){ $_=$qfile[++$i]; # ensure to have full file-name chomp(); if(/^READY/){ last; } # leave loop if(!$_) { next; } # skip empty lines $_=~s/^file:\/\///; # chop "file://" ($_,my @speakers) = split /;/; # read speakers for each file if(!/\.sea$/) { $_.=".sea"; } # append, if necessary ".sea" if(!/^(\/|\w:)/) { $_=~s/^/$corpusPath\// } # prepend, if necessary, corpusPath push @preFileNames, ("$_"); push @speakerlist, \@speakers; } } else{ @preFileNames = findFiles($corpusPath, "sea"); } } sub getTrackNr{ return 0 if($_[0] =~ /ANY TRACK/); my $i; for($i=1;$i<$nrOfTracks;$i++){ if($_[0] eq $trackLabels[$i]) { return $i; } } }