#!/bin/bash
# This script fills the search database and, optionally, the Lucene N-gram index.
# It should be run nightly via a cron job. It uses 1-3 cores and takes a few hours.
# NOTE: This assumes that the ingester is in /lat/tools/ingest_annex_search/

# The ingester expects the following JAR files in the same directory, as per
# JAR metadata classpath definition inside annot-search-lib-*.jar itself:

# annot-tools-*.jar corpusstructure-api-*.jar log4j-*.jar
# lucene-core-3.6.0.jar mpi-util-*.jar postgresql8jdbc3.jar

# It is unclear whether they are REALLY needed, but we also expect:
# xercesImpl-2.9.0.jar xml-apis-1.3.04.jar
# Additional fake corpusstructure-api + mpi-util dependencies can be ignored.



# This version includes the use of sudo. You may want to pick a suitable
# Linux user via cron already and then use ingester-no-sudo.sh instead.
# Export a UTF-8 locale to the commands started below.
export LANG=en_US.UTF-8
# Change to the directory where this script is: All other files are there.
# Abort if that fails, so that the relative JAR, properties and log paths
# below are never resolved against some unrelated working directory.
cd -- "$(dirname -- "$0")" || {
  printf '%s: cannot change to the script directory, aborting\n' "$0" >&2
  exit 1
}
export LAT_JAVA=/lat/java/bin/java
# Start the ingester in the background as user "webuser". The redirections
# are set up by this shell (i.e. as the invoking user, not as webuser):
# stdout goes to ingest.log, stderr to progress.log, both truncated first.
sudo -u webuser "$LAT_JAVA" -Xnoclassgc -Xfuture \
  -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Dsun.jnu.encoding=UTF-8 \
  -Xmx1536M -jar annot-search-lib-1.4.1.jar SearchDBIngester.properties \
  > ingest.log 2> progress.log &

# NOTE: Both logs are simply overwritten on each run, no need to archive them.
# HOWEVER, corpman should check warnings about malformed archived files!

# Optionally add, for profiling, to the Java command line options:
# -agentlib:hprof=cpu=samples,thread=y,interval=100,depth=8,cutoff=0.005,file=hprof.txt

