#!/usr/bin/perl -w

# version 5.0.3 last update on 29/11/2002 by <alex.klassmann@mpi.nl>

# 17/09/2002
# added functionality to match only on tiers associated with a specific speaker
# in that case the speaker codes must follow every search file spec seperated by a ';'
# /data/corpora/CGN/ANNOTATIONS/Corpora/cgn/annot/r4nl_08/sea/fn000892.sea;N00011;N00012
# 29/11/2002
# function "findFiles" is out-sourced to external file
# if name of Condition is followed by String "noregex", word boundaries are added to pattern

# make sure that all output is flushed.
use findFiles;
$| = 1;

$startTime = time;

my $usage='
Usage: cgn-search.pl [queryfile] [arguments] 
  -q				 quiet mode. No information of search status
  -s				 subset mode. File names are expected to follow query-information;
					 reads from <queryfile> if specified, otherwise from STDIN
					 In the latter case last file name must be followed by a line 
					 beginning with the word READY to terminate input
  -cp <corpuspath>	 expects the CGN-Files in <corpuspath>, 
					 if not set, built-in default is used

';

@trackLabels=qw(NUM ORT POS LEM MAR PHO BEG END NIL);

$nrOfTracks=$#trackLabels+1;

$splitChars = "[-\\s.,:;_'`?!\\\[\\\]]";

prepro();

$quiet || print "STARTING SEARCH\n"; 

foreach $fileName (@preFileNames) {
	# print $fileName."\n";
	search();
   	$files++;

	# report progress
	$quiet || print "PROGRESS ".$files/($#preFileNames + 1)."\n";
}

$duration = time - $startTime;

print "found $hits hits in $files files in $duration seconds\n";

$quiet || print "READY\n"; 


# do a search in a specific file
sub search {
	# read the file into an array
	return unless -r $fileName;

	open(SEARCHFILE, $fileName);
	@lines = <SEARCHFILE>;
	close(SEARCHFILE);
	
	$lastLineOfFile = $#lines;
	my $step = $pTrackNr[0]==0 ? 1 : $nrOfTracks;       # search any track resp. only one 	

	if($subset){ $speakers_ref = $speakerlist[$files];}  

	for($i = $pTrackNr[0]; $i <= $lastLineOfFile; $i+=$step){
		if($subset){
			if(@$speakers_ref > 0){
				my $cc=1;
				$speaker = substr($lines[$i-$pTrackNr[0]],rindex($lines[$i-$pTrackNr[0]]," ")+1,-1);
				foreach $s (@$speakers_ref) { if($s eq $speaker){ $cc=0; last; }}
				if($cc){ next; }
			}
		}
		# look for all occurences of the anchor pattern in this line (could be more than 1)
		if($caseSensitive[0]){
			while ($lines[$i] =~ /$anchorPattern/go){
				# remember information regarding the position of this hit
				$hitBegin = length $`;
				if($noregex[0]){ 
					$hitBegin++; 
					$hitEnd = $hitBegin + $patternLength[0]; 
				}
				else{ 
					$hitEnd = $hitBegin + length $&;
				}
				
				#check if the hit is in the content part of the preprocessed data
				if ($hitBegin > 3) {
					# get the hit info for the anchor pattern
					$boolHit[0] = 1;
					($utteranceNrHit[0], $wordNrHit[0]) = getPositionInfo($i, $hitBegin);
					
					# look for hits for the other patterns
					for ($j = 1; $j < $nrOfPatterns; $j++) {
						
						# get a list with all the hits for this pattern within its scope
						$hitsForPattern[$j] = rangeSearch($j, findRange($utteranceNrHit[0], $pScope[$j]));	
					}
					
					# try to find a combination of hits that evaluates to a true queryBool
					evaluateHits();
				}
				pos $lines[$i] = $hitBegin + 1;
			} 
		}
		else{
			while ($lines[$i] =~ /$anchorPattern/goi){
				# remember information regarding the position of this hit
				$hitBegin = length $`;
				if($noregex[0]){ 
					$hitBegin++; 
					$hitEnd = $hitBegin + $patternLength[0]; 
				}
				else{ 
					$hitEnd = $hitBegin + length $&;
				}
				
				#check if the hit is in the content part of the preprocessed data
				if ($hitBegin > 3) {
					# get the hit info for the anchor pattern
					$boolHit[0] = 1;
					($utteranceNrHit[0], $wordNrHit[0]) = getPositionInfo($i, $hitBegin);
					
					# look for hits for the other patterns
					for ($j = 1; $j < $nrOfPatterns; $j++) {
						
						# get a list with all the hits for this pattern within its scope
						$hitsForPattern[$j] = rangeSearch($j, findRange($utteranceNrHit[0], $pScope[$j]));	
					}
					
					# try to find a combination of hits that evaluates to a true queryBool
					evaluateHits();
				}
				pos $lines[$i] = $hitBegin + 1;
			} 
		}
	}
}

# tries all combinations of hits for all the patterns until a true queryBool is found
sub evaluateHits {
	my $i;
	
	#initialize the hitIndices
	$hitIndex[1] = -1;
	for ($i = 2; $i < $nrOfPatterns; $i++) {
		$hitIndex[$i] = 0;
	}
	
	do {
		# calculate the next combination of hitIndices
		# this loop systematicaly runs through all possible combinations of hit indices
		for ($i = 1; $i < $nrOfPatterns; $i++) {
			if ($hitIndex[$i] < $#{$hitsForPattern[$i]}) {
				$hitIndex[$i]++;
				last;
			}
			else {
				$hitIndex[$i] = 0;
			}
		}
		
		# set all the queryBool parameters for the current combination of hits
		for ($i = 1; $i < $nrOfPatterns; $i++) {
			($boolHit[$i], $utteranceNrHit[$i], $wordNrHit[$i]) = @{$hitsForPattern[$i]->[$hitIndex[$i]]};
		}
		
		# if this combination gives a true queryBool then show the hit info 
		# and continue with the next ANCHOR pattern search.

		if (eval($queryBool)) {
			$hits++;
			printHitInfo();
			return;
		}
		
	}
	while (!triedAllCombinations());
}

# check if all combinations of hits are evaluated
sub triedAllCombinations {
	my $i;
	for ($i = 1; $i < $nrOfPatterns; $i++) {
		if ($hitIndex[$i] < $#{$hitsForPattern[$i]}) {
			return 0; # false
		}
	}

	return 1; # true
}

sub findRange {
	my ($anchorBlockNr, $scope) = @_;

	my ($from, $to) = @$scope;

	my $firstLine;
	if ($from =~ /\-X/) {
		$firstLine = 0;
	}
	else {
		my $firstBlockNr = $anchorBlockNr + $from; 
		$firstLine = $firstBlockNr*$nrOfTracks;
		if($firstLine < 0) { $firstLine = 0; }
	}
	
	my $lastLine;
	if ($to =~ /\+X/) {
		$lastLine = $lastLineOfFile;
	}
	else {
		my $lastBlockNr = $anchorBlockNr + $to + 1; 
		$lastLine = $lastBlockNr*$nrOfTracks;
		if($lastLine > $lastLineOfFile) { $lastLine = $lastLineOfFile; }
	}
	
	return ($firstLine, $lastLine);
}


sub rangeSearch { 
	my ($nrOfPattern, $rangeBegin, $rangeEnd) = @_;
	
	my @hits = ();
	my $hitBegin;
	my $utteranceNr;
	my $lengthHit;
	my $wordNr;

	my $pattern=$pRegExp[$nrOfPattern];
	my $step = ($pTrackNr[$nrOfPattern]==0) ? 1 : $nrOfTracks;

	my $i;
	for($i = $rangeBegin+$pTrackNr[$nrOfPattern]; $i <= $rangeEnd; $i+=$step) {
		pos $lines[$i] = 0;
		if($caseSensitive[$nrOfPattern]){
			while ($lines[$i] =~ /$pattern/g) {
				$hitBegin = length $`;
				if($noregex[$nrOfPattern]){
					$hitBegin++;
					pos $lines[$i] = $hitBegin + $patternLength[$nrOfPattern];
				}
				else{
					pos $lines[$i] = $hitBegin + 1;
				}
				
				push @hits, [1, getPositionInfo($i, $hitBegin)];
			}
		}
		else{
			while ($lines[$i] =~ /$pattern/gi) {
				$hitBegin = length $`;
				if($noregex[$nrOfPattern]){
					$hitBegin++;
					pos $lines[$i] = $hitBegin + $patternLength[$nrOfPattern];
				}
				else{
					pos $lines[$i] = $hitBegin + 1;
				}
				
				push @hits, [1, getPositionInfo($i, $hitBegin)];
			}
		}
	}
	
	if ($#hits == -1) {
		push @hits, [0, "NOT FOUND\n", -1, -1];
	}
	
	return \@hits;
}


sub getPositionInfo {
	my ($lineNr, $hitBegin) = @_;
	
	# find the block number
	my $blockNr=int($lineNr/$nrOfTracks);

	# find the word number
	my @wordsBeforeHit = split /\s+/, substr($lines[$lineNr],0,$hitBegin);
	my $wordNr = $#wordsBeforeHit;

	if (substr($lines[$lineNr], $hitBegin-1, 1) ne ' ') {
		$wordNr--;
	} 

	return ($blockNr, $wordNr);
}

# returns the tier name and the content of the ORT track given a block number
sub getBlockInfo {
	my ($blockNr) = @_;

	my $i         = $blockNr*$nrOfTracks;
	my $tierName  = substr($lines[$i],rindex($lines[$i]," ")+1);
	my $ortContent= substr($lines[$i + 1], 4);

	chop($tierName);
	chop($ortContent);
	
   	return ($tierName, $ortContent);	
}

sub printHitInfo {
	# The following info regarding the hit is printed:
	#
	# Transcription file name + path (the .sea file is returned)
	# Tier name
	# Track type
	# Utterance number (= Tag number)
	# Content of the Ort track
	# word (tuple) number of the hit
	# hit begin position in the content of the ORT track
	# hit end position in the content of the ORT track
	
	print "FN: $fileName\n";
	
	($tierName, $ortContent) = getBlockInfo($utteranceNrHit[0]);
	
	print "TN: $tierName\n";
	print "TT: $trackLabels[$pTrackNr[0]]\n";
	print "UN: $utteranceNrHit[0]\n";
	print "OC: $ortContent\n";
	print "WN: $wordNrHit[0]\n";
	
	# if ANCHOR was in ORT track show detailed hit position info
	# else show position info on the word level
	if ($anchorIsORT) {
		# correct for the ORT label
		$hitBeginPos = $hitBegin - 4;
		$hitEndPos =$hitEnd - 4;
	}
	else {
		my @words = split / /, $ortContent;
		$hitword = $words[$wordNrHit[0]];
		if($wordNrHit[0]==0){
			$hitBeginPos=0;
		}
		else{
			$pre = join ' ', @words[0..$wordNrHit[0] - 1];
			$hitBeginPos = (length $pre) + 1;
		}
		$hitword=~s/\W+$//;                                     # strip off interpunctation
		$hitEndPos = $hitBeginPos + length $hitword;
	}	
	
	print "HB: $hitBeginPos\n";
	print "HE: $hitEndPos\n";

	my @begintimes = split / /, $lines[$utteranceNrHit[0]*$nrOfTracks+getTrackNr("BEG")];
	my @endtimes   = split / /, $lines[$utteranceNrHit[0]*$nrOfTracks+getTrackNr("END")];
	print "BT: $begintimes[$wordNrHit[0]+1]\n";
	print "ET: $endtimes[$wordNrHit[0]+1]\n";
}

sub prepro {

	$corpusPath = "/data/corpora/CGN/ANNOTATIONS/Corpora/cgn/annot";

	#looking for arguments
	for($i=0;$i<=$#ARGV;$i++){
		if($ARGV[$i] !~ /^\-/) { $qfile=$ARGV[$i]; next; }
		if($ARGV[$i] eq "-q")  { $quiet=1;  next; }
		if($ARGV[$i] eq "-s")  { $subset=1; next; }
		if($ARGV[$i] eq "-cp") { $corpusPath=$ARGV[++$i]; next; }
		die $usage;
	}
	
 	if($subset) { $endkey="READY"; } else { $endkey="BOOL"; }

	if(defined($qfile)){
		# read query definition from $qfile
		-r $qfile || die "Can't read query-file $qfile! \n";
		open(QFILE, $qfile);
		@qfile = <QFILE>;
		close(QFILE);
	}
	else{
		# read query definition from STDIN
		@qfile = ();
		do {
			$input = <STDIN>;
			push @qfile, $input;
		}
		while ($input !~ /^$endkey/) 
		}		
	
	@pName = ();
	@pRegExp = ();
	@pTrackNr = ();
	@pScope = ();
	$i=0;
	while($qfile[$i] !~ /^BOOL: /) {
	
		$qfile[$i] =~ s/\n$//;
		$qfile[$i + 1] =~ s/\n$//;
		$qfile[$i + 2] =~ s/\n$//;
		$qfile[$i + 3] =~ s/\n$//;

		if($qfile[$i] =~ s/\s+nocase$//){
			$caseSensitive[$i/5] = 0;
		}
		else{
			$caseSensitive[$i/5] = 1;
		}
		if($qfile[$i] =~ s/\s+noregex$//){
			$noregex[$i/5] = 1;
			$patternLength[$i/5] = length $qfile[$i+1];
			$qfile[$i+1]=~s/\(/\\\(/g;
			$qfile[$i+1]=~s/\)/\\\)/g;
            $qfile[$i+1] = $splitChars.$qfile[$i+1].$splitChars;
		}
		else{
			$noregex[$i/5] = 0;
		}

		push @pName,       ($qfile[$i]);
		push @pRegExp,     ($qfile[$i + 1]);
		push @pTrackNr,    (getTrackNr($qfile[$i + 2]));
		if ($qfile[$i + 3] =~ /ANCHOR/) {
			push @pScope, ([""]);
			$anchorPattern = $qfile[$i + 1];
		}
		else{
			my @qrange = split / /,$qfile[$i+3];
			if($qrange[-1]=~/[aswu]/){
				pop @qrange;
				if($& ne "u"){ 
					$qrange[-2]="0";
					$qrange[-1]="0";
				} 
			}
			$refQuery=$qfile[0];
			if($qrange[0]=~/^(.*):/) {
				$refQuery=$1; 
				shift @qrange;
				for($k = 1; $k < $#pName; $k++){
					if($1 eq $pName[$k]){
						$qrange[0]+=$pScope[$k][0];
						$qrange[1]+=$pScope[$k][1];
					}
				}
			} 
			push @pScope, (\@qrange);			
		}
		
		$i+=5;
	}
	$nrOfPatterns=$i/5;
	$qfile[$i] =~ s/\n$//;

	($qBool = $qfile[$i]) =~ s/^BOOL: //;
	
	# transform the query boolean into a perl evaluable form
	$queryBool = " ".$qBool." ";
	for ($j = 0; $j < $nrOfPatterns; $j++) {
		$queryBool =~ s/(\s|\!|\()$pName[$j](\s|\))/$1\$boolHit[$j]$2/g;
		$queryBool =~ s/(\s|\!|\()$pName[$j]\.unr/$1\$utteranceNrHit[$j]/g;
		$queryBool =~ s/(\s|\!|\()$pName[$j]\.wnr/$1\$wordNrHit[$j]/g;
	}

	$#boolHit        = $nrOfPatterns;
	$#utteranceNrHit = $nrOfPatterns;
	$#wordNrHit      = $nrOfPatterns;
	$#hitIndex       = $nrOfPatterns;
	$hits  = 0;
	$files = 0;

	$anchorIsORT=($pTrackNr[0]==getTrackNr("ORT"));

	@preFileNames = ();
	@speakerlist = ();

	if($subset){
		# continue reading qfile until EOF or key-word "READY"
		while($i<$#qfile){
			$_=$qfile[++$i];
			# ensure to have full file-name
			chomp();						
			if(/^READY/){ last; }			      # leave loop
			if(!$_) { next; }				      # skip empty lines
			$_=~s/^file:\/\///;				      # chop "file://"
			($_,my @speakers) = split /;/;        # read speakers for each file
			if(!/\.sea$/) { $_.=".sea"; }	      # append, if necessary ".sea"
			if(!/^(\/|\w:)/) { $_=~s/^/$corpusPath\// } # prepend, if necessary, corpusPath
			push @preFileNames, ("$_"); 
			push @speakerlist, \@speakers;

		}
	}
	else{
		@preFileNames = findFiles($corpusPath, "sea");
	}
}

sub getTrackNr{
	return 0 if($_[0] =~ /ANY TRACK/);       
	my $i;
	for($i=1;$i<$nrOfTracks;$i++){
		if($_[0] eq $trackLabels[$i]) { return $i; }
	}
}
