#!/usr/local/bin/perl

# make sure that all output is flushed.
$| = 1;

$startTime = time;

prepro();

print "STARTING SEARCH\n";
foreach $fileName (@preFileNames) {
	open(INFILE, $fileName);
	$files++;
	
	while ($line = <INFILE>) {
		# look for the anchor pattern
		if ($line =~ /$anchorPattern/) {
			# check if the pattern occurs in the right track
			if ($pTrackLabel[0] =~ /ANY TRACK/ || $line =~ /^$pTrackLabel[0]/) {
				search();  # take a closer look at this file
				last;      # and continue with the next file
			}
        	}
	}
	
	close(INFILE);
	
	# report progress
	$p = $files / ($#preFileNames + 1);
	print "PROGRESS $p\n";
}

$duration = time - $startTime;

print "found $hits hits in $files files in $duration seconds\n";
print "READY\n";


# do a search in a specific file from which we know that it contains the anchor pattern
sub search {
	# read the file into an array
	open(SEARCHFILE, $fileName);
	@lines = <SEARCHFILE>;
	close(SEARCHFILE);
	$nrOfLines = $#lines + 1;
	
	for($i = 0; $i < $nrOfLines; $i++) {
		# look for all occurences of the anchor pattern in this line (could be more than 1)
		while ($lines[$i] =~ /$anchorPattern/g) {
			# remember information regarding the position of this hit
			$hitBegin = length $`;
			$hitEnd = $hitBegin + length $&;
	
			# ignore the block nr. info lines if they accidently contain the anchor pattern
      			if ($lines[$i] =~ /^\d/) {
				next;
			}
			
			#check if the hit is in the right track and in the content part of the preprocessed data
			if ($pTrackLabel[0] =~ /ANY TRACK/ || index($lines[$i], $pTrackLabel[0]) == 0 && $hitBegin > 3) {
				# get the hit info for the anchor pattern
				$boolHit[0] = 1;
				$trackHit[0] = $lines[$i];
				($utteranceNrHit[0], $wordNrHit[0], $lengthHit[0]) = getPositionInfo($i, $hitBegin);
			
				# look for hits for the other patterns
				for ($j = 1; $j < $nrOfPatterns; $j++) {
					# find the first and last line of this patterns scope
					($from, $to) = findRange($utteranceNrHit[0], $pScope[$j]);	
					
					# get a list with all the hits for this pattern within its scope
					$hitsForPattern[$j] = rangeSearch($pRegExp[$j], $pTrackLabel[$j], $from, $to);	
				}
				
				# try to find a combination of hits that evaluates to a true queryBool
				evaluateHits();
			}
			
			# make sure the search for the anchor pattern continues at the right place
			pos $lines[$i] = $hitBegin + 1;
		} 
	}
}

# tries all combinations of hits for all the patterns until a true queryBool is found
sub evaluateHits {
	my $i;
	
	#initialize the hitIndices
	$hitIndex[1] = -1;
	for ($i = 2; $i < $nrOfPatterns; $i++) {
		$hitIndex[$i] = 0;
	}

	do {
		# calculate the next combination of hitIndices
		# this loop systematicaly runs through all possible combinations of hit indices
		for ($i = 1; $i < $nrOfPatterns; $i++) {
			if ($hitIndex[$i] < $#{$hitsForPattern[$i]}) {
				$hitIndex[$i]++;
				last;
			}
			else {
				$hitIndex[$i] = 0;
			}
		}
		
		# set all the queryBool parameters for the current combination of hits
		for ($i = 1; $i < $nrOfPatterns; $i++) {
			($boolHit[$i], $utteranceNrHit[$i], $wordNrHit[$i], $lengthHit[$i]) = @{$hitsForPattern[$i]->[$hitIndex[$i]]};
			
		}
		
		# if this combination gives a true queryBool then show the hit info 
		# and continue with the next ANCHOR pattern search.

		if (eval($queryBool)) {
			$hits++;
			printHitInfo();
			return;
		}
		
	}
	while (!triedAllCombinations());
}

# check if all combinations of hits are evaluated
sub triedAllCombinations {
	my $i;
	for ($i = 1; $i < $nrOfPatterns; $i++) {
		if ($hitIndex[$i] < $#{$hitsForPattern[$i]}) {
			return 0; # false
		}
	}

	return 1; # true
}

sub findRange {
	my ($anchorBlockNr, $scope) = @_;
	
	my $from = $scope;
	$from =~ s/ .*//;
	my $to = $scope;
	$to =~ s/.* //;  # scope mag geen spaties aan het einde hebben, dit moet beter!!!
	
	my $firstLine;
	if ($from =~ /\-X/) {
		$firstLine = 0;
	}
	else {
		my $firstBlockNr = $anchorBlockNr + $from; 
		if ($firstBlockNr < 0) {
			$firstBlockNr = 0;
		}
		
		my $i = 0;
		while ($lines[$i] !~ /^$firstBlockNr /) {
			$i++;
		}
		$firstLine = $i + 1;
	}
	
	my $lastLine;
	if ($to =~ /\+X/) {
		$lastLine = $#lines;
	}
	else {
		my $lastBlockNr = $anchorBlockNr + $to + 1; 
		
		my $i = 0;
		while ($i <= $#lines && $lines[$i] !~ /^$lastBlockNr /) {
			$i++;
		}
		$lastLine = $i - 1;
	}
	
	return ($firstLine, $lastLine);
}


sub rangeSearch { 
	my ($pattern, $trackLabel, $rangeBegin, $rangeEnd) = @_;
	
	my @hits = ();
	my $hitBegin;
	my $utteranceNr;
	my $utteranceLength;
	my $wordNr;
	my $i;
	for($i = $rangeBegin; $i <= $rangeEnd; $i++) {
		pos $lines[$i] = 0;
		while ($lines[$i] =~ /$pattern/g) {
			$hitBegin = length $`;
				
			if ($trackLabel =~ /ANY TRACK/ || index($lines[$i], $trackLabel) == 0) {
				($utteranceNr, $wordNr, $utteranceLength) = getPositionInfo($i, $hitBegin);
				push @hits, [1, $utteranceNr, $wordNr, $utteranceLength];
			}
			
			pos $lines[$i] = $hitBegin +  1;
		}
	}
	
	if ($#hits == -1) {
		push @hits, [0, "NOT FOUND\n", -1, -1];
	}
	
	return \@hits;
}


sub getPositionInfo {
	my ($lineNr, $hitBegin) = @_;
	
	# find the block number
	my $i = $lineNr;
	while ($lines[$i] !~ /^\d/) {
		$i--;
	}
	my $blockNr = $lines[$i];
	$blockNr =~ s/ .*//;
	chop($blockNr);
	
	# find the word number
	my @words = split /\s+/, $lines[$lineNr];
	my $length = 0;
	my $wordNr = 0;
	while ($length <= $hitBegin) {
		$length += (length $words[$wordNr++]) + 1;         # +1 because of the space characters
	}
	$subst = substr($lines[$lineNr], $hitBegin, 1);

	if (substr($lines[$lineNr], $hitBegin, 1) eq ' ') {
		$wordNr -= 1;
	}
	else {
		$wordNr -= 2;
	}

	return ($blockNr, $wordNr, $#words);
}

# returns the tier name and the content of the ORT track given a block number
sub getBlockInfo {
	my ($blockNr) = @_;

	my $i;
#BUG removed. blocks are now 8 lines
#	for ($i = 0; $lines[$i] !~ /^$blockNr/ && $i < $#lines; $i += 6) {}
	for ($i = 0; $lines[$i] !~ /^$blockNr/ && $i < $#lines; $i += 8) {}
	
	my $tierName;
	my $ortContent;
	if ($lines[$i] =~ /^$blockNr/) {
		$tierName = $lines[$i];
		$tierName =~ s/.* //;
		chop($tierName);
		$ortContent = substr($lines[$i + 1], 4);
		chop($ortContent);
	}
	
	return ($tierName, $ortContent);	
}

sub printHitInfo {
	# The following info regarding the hit is printed:
	#
	# Transcription file name + path (the .sea file is returned)
	# Tier name
	# Track type
	# Utterance number (= Tag number)
	# Content of the Ort track
	# word (tuple) number of the hit
	# hit begin position in the content of the ORT track
	# hit end position in the content of the ORT track
	
	print "FN: $fileName\n";
	
	($tierName, $ortContent) = getBlockInfo($utteranceNrHit[0]);
	
	print "TN: $tierName\n";
	print "TT: $pTrackLabel[0]\n";
	print "UN: $utteranceNrHit[0]\n";
	print "OC: $ortContent\n";
	print "WN: $wordNrHit[0]\n";
	
	# if ANCHOR was in ORT track show detailed hit position info else show position info on the word level
	if ($pTrackLabel[0] eq "ORT") {
		# correct for the ORT label
		$hitBeginPos = $hitBegin - 4;
		$hitEndPos =$hitEnd - 4;
	}
	else {
		my @words = split / /, $ortContent;
		$hitword = $words[$wordNrHit[0]];
		$pre = join ' ', @words[0..$wordNrHit[0] - 1];
		$hitBeginPos = (length $pre) + 1;
		$hitEndPos = $hitBeginPos + length $hitword;
	}	
		
	print "HB: $hitBeginPos\n";
	print "HE: $hitEndPos\n";
}

sub prepro {
	$corpusDir = "Corpora/cgn/annot";
	@preFileNames = fileNames($corpusDir, "sea");
	
	$qfile = $ARGV[0];
	if ($qfile eq "") {
		# read query definition from STDIN
		@qfile = ();
		do {
			$input = <STDIN>;
			push @qfile, $input;
		}
		while ($input !~ /^BOOL/) 
	}
	else {
		# read query definition from $qfile
		open(QFILE, $qfile);
		@qfile = <QFILE>;
		close(QFILE);
	}

	@pName = ();
	@pRegExp = ();
	@pTrackLabel = ();
	@pScope = ();
	$nrOfPatterns = 0;
	for ($i = 0; $i <= $#qfile; $i += 5) {
		if ($qfile[$i] =~ /^BOOL: /) {
			$qfile[$i] =~ s/\n$//;
			($qBool = $qfile[$i]) =~ s/^BOOL: //;
		}
		else {
			$qfile[$i] =~ s/\n$//;
			$qfile[$i + 1] =~ s/\n$//;
			$qfile[$i + 2] =~ s/\n$//;
			$qfile[$i + 3] =~ s/\n$//;
		
			push @pName, ($qfile[$i]);
			push @pRegExp, ($qfile[$i + 1]);
			push @pTrackLabel, ($qfile[$i + 2]);
			push @pScope, ($qfile[$i + 3]);
		
			if ($qfile[$i + 3] =~ /ANCHOR/) {
				$anchorPattern = $qfile[$i + 1];
			}
		
			$nrOfPatterns++;
		}
	}

	# transform the query boolean into a perl evaluable form
	$queryBool = " ".$qBool." ";
	for ($i = 0; $i < $nrOfPatterns; $i++) {
		$queryBool =~ s/(\s|\!|\()$pName[$i](\s|\))/$1\$boolHit[$i]$2/g;
		$queryBool =~ s/(\s|\!|\()$pName[$i]\.unr/$1\$utteranceNrHit[$i]/g;
		$queryBool =~ s/(\s|\!|\()$pName[$i]\.wnr/$1\$wordNrHit[$i]/g;
		$queryBool =~ s/(\s|\!|\()$pName[$i]\.len/$1\$lengthHit[$i]/g;
	}

	$#boolHit = $nrOfPatterns;
	$#trackHit = $nrOfPatterns;
	$#utteranceNrHit = $nrOfPatterns;
	$#wordNrHit = $nrOfPatterns;
	$#lengthHit = $nrOfPatterns;
	$#hitIndex = $nrOfPatterns;
	$hits = 0;
	$files = 0;
}


sub fileNames {
	my ($path, $extension) = @_;

	opendir(DIR, $path);
	my @entries = readdir(DIR);
	close(DIR);
	
	my @files = ();
#	push @files, "/data/hsm-archive/eudico/corpora/cgn/annot/r1nl_01/sea/fn000001.sea";
#	return @files;
	
	my $entry;
	foreach $entry (@entries) {
		if ($entry =~ /^\./) {
			next;
		}
		
		my $newPath = $path."/".$entry;
		
		if (-d $newPath) {
			push @files, fileNames($newPath, $extension);
		}
		
		if ($entry =~ /\.$extension/) {
			push @files, ("$newPath");
		}
	}

	return @files;
}


