/*
 * Decompiled with CFR 0.152.
 */
package nl.mpi.annot.search.lib;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Date;
import nl.mpi.annot.search.lib.LuceneNGramAnalyzer;
import nl.mpi.annot.search.lib.LuceneNGramTokenizer;
import nl.mpi.annot.tools.data.AnnexAnnotation;
import nl.mpi.annot.tools.data.AnnexTier;
import nl.mpi.annot.tools.data.AnnexTranscription;
import nl.mpi.corpusstructure.NodeIdUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Writes annotation transcriptions into a Lucene index, one Lucene document per tier.
 *
 * <p>Each tier document carries the fields {@code nodeid}, {@code tiername},
 * {@code tiertype}, an analyzed {@code contents} field and one {@code <n>grams}
 * token-stream field for every n below the configured maximum n-gram length.
 * A {@code tierid} field is added when the transcription has at most
 * {@link #MAX_TIERS_PER_NODE} tiers, and a {@code path} field is added to the
 * first tier document of each file only.
 *
 * <p>The per-length tokenizers are created once and re-used for every tier;
 * nothing in this class synchronizes access to them.
 */
public class LuceneIndexWriter {
    private static final Logger _logger = Logger.getLogger(LuceneIndexWriter.class.getName());
    public static final int MAX_NGRAM_LEN = 5;
    public static final int MAX_TIERS_PER_NODE = 256;

    /** Directory names that {@link #walkDirectory(File)} never descends into. */
    private static final String[] SKIPPED_DIRECTORIES = {
        "sessions", "corpman", "Metadata", "Media", "Corpusstructure"
    };
    /** Further directory names that are skipped, logged as "special". */
    private static final String[] SKIPPED_SPECIAL_DIRECTORIES = {
        "infoFiles", "TRASH", "replaced-files", "eaf_old", "tst-cgn"
    };

    private final int _maxNgramLength;
    private final boolean _withPositions;
    private final int _withFrequencies;
    private int _fileCount = 0;
    private int _tierCount = 0;
    private long _charCount = 0L;
    final IndexWriter _writer;
    final Tokenizer[] _tokenizers;

    /**
     * Opens (or creates) the index in the given directory.
     *
     * @param indexDirectory      directory holding the Lucene index
     * @param update              {@code true} to open with CREATE_OR_APPEND,
     *                            {@code false} to open with CREATE
     * @param maxNgramLength      n-gram length used for the {@code contents} analyzer;
     *                            shorter lengths 1..maxNgramLength-1 get their own fields
     * @param withPositions       whether the {@code contents} field stores positions
     * @param maxNgramFrequencies frequencies are stored for n-gram fields with
     *                            n &lt;= this value; a negative value also disables
     *                            frequencies on the tier name/type fields
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public LuceneIndexWriter(File indexDirectory, boolean update, int maxNgramLength, boolean withPositions, int maxNgramFrequencies) throws IOException {
        this._maxNgramLength = maxNgramLength;
        this._withPositions = withPositions;
        this._withFrequencies = maxNgramFrequencies;
        FSDirectory dir = FSDirectory.open(indexDirectory);
        LuceneNGramAnalyzer analyzer = new LuceneNGramAnalyzer(this._maxNgramLength, this._maxNgramLength);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        // Slot 0 stays unused so that the array index equals the n-gram length.
        this._tokenizers = new Tokenizer[this._maxNgramLength];
        for (int n = 1; n < this._maxNgramLength; ++n) {
            // The initial reader is a placeholder; it is replaced via reset() for every tier.
            this._tokenizers[n] = new LuceneNGramTokenizer(new StringReader("deadbeef"), n, n);
        }
        iwc.setOpenMode(update ? IndexWriterConfig.OpenMode.CREATE_OR_APPEND : IndexWriterConfig.OpenMode.CREATE);
        this._writer = new IndexWriter(dir, iwc);
    }

    /**
     * Logs the running statistics and closes the underlying index writer.
     *
     * @throws IOException if closing the index writer fails
     */
    public void close() throws IOException {
        _logger.info("# Statistics: " + this._fileCount + " files, " + this._tierCount + " tiers, " + this._charCount + " chars processed");
        this._writer.close();
        System.err.println("Lucene index closed (" + this._fileCount + " files, " + this._tierCount + " tiers, " + this._charCount + " chars)");
    }

    /**
     * Adds one document per tier of the given transcription to the index.
     *
     * <p>Transcriptions that are {@code null}, invalid or without tiers are
     * logged and skipped. When the writer was not opened in CREATE mode, the
     * documents previously indexed with the same {@code path} are deleted
     * before the new tier documents are added.
     *
     * @param transcription      parsed transcription to index
     * @param filesystemLocation file the transcription was read from; used for
     *                           log messages, the {@code path} field and a size hint
     * @throws IOException if the index writer fails
     */
    public void indexTranscription(AnnexTranscription transcription, File filesystemLocation) throws IOException {
        if (transcription == null || !transcription.isValid()) {
            _logger.error("Skipped file, parse failed: " + filesystemLocation.getPath());
            return;
        }
        int nTiers = transcription.getTiers().size();
        if (nTiers == 0) {
            _logger.error("Skipped file, empty text body: " + filesystemLocation.getPath());
            return;
        }
        // Initial capacity hint for the per-tier text buffer, never below 1000 chars.
        int averageTierSize = Math.max(1000, ((int) filesystemLocation.length()) / nTiers);

        String nodeId = transcription.getNodeId();
        long nodeNumber = NodeIdUtils.TOINT(nodeId);
        if (nodeNumber >= 0x800000L) {
            _logger.warn("NodeID >= 2^23, non-int32 TierID for: " + nodeId);
        }
        if (nTiers > 64) {
            _logger.info("Many tiers: " + nTiers + " tiers for: " + nodeId);
            if (nTiers > MAX_TIERS_PER_NODE) {
                _logger.error("More than 256 tiers, not indexing TierID for: " + nodeId + " (" + nTiers + " tiers)");
            }
        }

        String absolutePath = filesystemLocation.getAbsolutePath();
        if (this._writer.getConfig().getOpenMode() != IndexWriterConfig.OpenMode.CREATE) {
            // Delete once, up front. Calling updateDocument() with the path term for
            // every tier would delete the first tier document added in this very
            // call, because it is the only one that carries the "path" field.
            // NOTE(review): documents of tiers after the first carry no "path" field,
            // so stale ones from an earlier run cannot be matched by this term.
            this._writer.deleteDocuments(new Term("path", absolutePath));
        }

        FieldInfo.IndexOptions nameOptions = this._withFrequencies >= 0
                ? FieldInfo.IndexOptions.DOCS_AND_FREQS
                : FieldInfo.IndexOptions.DOCS_ONLY;

        for (int i = 0; i < nTiers; ++i) {
            Document doc = new Document();
            AnnexTier tier = (AnnexTier) transcription.getTiers().get(i);
            int nAnnotations = tier.annotations.size();
            if (nAnnotations > Short.MAX_VALUE) {
                _logger.warn("Tier with > 32k annotations: " + nAnnotations + " in '" + tier.name + "' in: " + nodeId + " (" + nTiers + " tiers) File: " + absolutePath);
            }

            addDocsOnlyField(doc, "nodeid", nodeId);
            if (nTiers <= MAX_TIERS_PER_NODE) {
                // Tier id = node number * 256 + tier index within the file.
                long tierNumber = nodeNumber * MAX_TIERS_PER_NODE + (long) i;
                addDocsOnlyField(doc, "tierid", Long.toString(tierNumber));
            }
            if (i == 0) {
                // The path is stored on the first tier document of a file only.
                addDocsOnlyField(doc, "path", absolutePath);
            }

            Field tnField = new Field("tiername", tier.name, Field.Store.YES, Field.Index.NOT_ANALYZED);
            tnField.setIndexOptions(nameOptions);
            doc.add(tnField);
            Field ttField = new Field("tiertype", tier.type, Field.Store.YES, Field.Index.NOT_ANALYZED);
            ttField.setIndexOptions(nameOptions);
            doc.add(ttField);

            String tierContentString = joinAnnotationValues(tier, nAnnotations, averageTierSize);
            this._charCount += (long) tierContentString.length();

            Field contentField = new Field("contents", tierContentString, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO);
            if (this._withPositions) {
                contentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
            } else if (this._withFrequencies >= this._maxNgramLength) {
                contentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS);
            } else {
                contentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY);
            }
            doc.add(contentField);

            // One extra field per shorter n-gram length, fed by the re-used tokenizers.
            for (int n = 1; n < this._maxNgramLength; ++n) {
                this._tokenizers[n].reset(new StringReader(tierContentString));
                Field shortField = new Field(Integer.toString(n) + "grams", this._tokenizers[n]);
                shortField.setIndexOptions(this._withFrequencies >= n
                        ? FieldInfo.IndexOptions.DOCS_AND_FREQS
                        : FieldInfo.IndexOptions.DOCS_ONLY);
                doc.add(shortField);
            }

            ++this._tierCount;
            this._writer.addDocument(doc);
        }

        ++this._fileCount;
        // Test the coarser interval first: every multiple of 1000 is also a multiple of 100.
        if (this._fileCount % 1000 == 0) {
            _logger.info("Progress: " + this._fileCount + " files, " + this._tierCount + " tiers, " + this._charCount + " chars processed :-)");
        } else if (this._fileCount % 100 == 0) {
            _logger.debug("Progress: " + this._fileCount + " files, " + this._tierCount + " tiers, " + this._charCount + " chars processed");
        }
    }

    /** Adds a stored, non-analyzed, norm-free field indexed with DOCS_ONLY to the document. */
    private static void addDocsOnlyField(Document doc, String name, String value) {
        Field field = new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
        field.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY);
        doc.add(field);
    }

    /** Concatenates the values of the first {@code nAnnotations} annotations of a tier, separated by single spaces. */
    private static String joinAnnotationValues(AnnexTier tier, int nAnnotations, int capacityHint) {
        StringBuilder tierContent = new StringBuilder(capacityHint);
        for (int j = 0; j < nAnnotations; ++j) {
            if (j > 0) {
                tierContent.append(' ');
            }
            AnnexAnnotation annotation = (AnnexAnnotation) tier.annotations.get(j);
            tierContent.append(annotation.value);
        }
        return tierContent.toString();
    }

    /**
     * Recursively walks a file or directory and indexes every supported
     * annotation file found below it.
     *
     * <p>Unreadable entries, a fixed set of directory names, files with an
     * unknown suffix, files whose path contains neither {@code /Annotations/}
     * nor {@code /eaf/}, and files larger than 23 MB are skipped. Files that
     * fail to parse with a {@link RuntimeException} are logged and skipped.
     *
     * @param file file or directory to process
     * @throws IOException if the index writer fails
     */
    void walkDirectory(File file) throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            String dirName = file.getName();
            if (containsName(SKIPPED_DIRECTORIES, dirName)) {
                _logger.debug("Skipped directory: " + file.getPath());
                return;
            }
            if (containsName(SKIPPED_SPECIAL_DIRECTORIES, dirName)) {
                _logger.debug("Skipped special directory: " + file.getPath());
                return;
            }
            String[] children = file.list();
            if (children != null) {
                for (int i = 0; i < children.length; ++i) {
                    this.walkDirectory(new File(file, children[i]));
                }
            }
            return;
        }

        String path = file.getPath();
        int fileType = fileTypeFor(file.getName());
        if (fileType < 0) {
            // Only files inside an Annotations directory are worth a log entry.
            if (path.indexOf("/Annotations/") != -1) {
                if (path.endsWith("tr")) {
                    _logger.info("Skipped *.*tr file, not indexable: " + path);
                } else {
                    _logger.warn("Skipped file, not indexable: " + path);
                }
            }
            return;
        }
        if (path.indexOf("/Annotations/") == -1 && path.indexOf("/eaf/") == -1) {
            if (path.indexOf("/CGN/") == -1) {
                _logger.info("Skipped file, not in Annotations (or CGN eaf) dir: " + path);
            } else {
                _logger.debug("Skipped CGN file, not in Annotations or CGN eaf dir: " + path);
            }
            return;
        }
        long fileSize = file.length();
        if (fileSize > 0x1700000L) {
            _logger.error("Skipped file, " + fileSize + " bytes (more than 23 MB) in size: " + path);
            return;
        }
        AnnexTranscription transcription;
        try {
            // Node ids are synthesized from the running count of indexed files.
            transcription = new AnnexTranscription(NodeIdUtils.TONODEID(4000000 + this._fileCount), fileType, file);
        } catch (RuntimeException re) {
            _logger.error("Skipped file, parse error: " + path + " Error: " + re);
            return;
        }
        this.indexTranscription(transcription, file);
    }

    /** Returns whether {@code name} equals one of the entries of {@code names}. */
    private static boolean containsName(String[] names, String name) {
        for (int i = 0; i < names.length; ++i) {
            if (names[i].equals(name)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Maps a file name suffix to the numeric file type code handed to the
     * {@code AnnexTranscription} constructor, or -1 when the suffix is not
     * recognized. The meaning of the individual codes is defined by
     * {@code AnnexTranscription}, not here.
     */
    private static int fileTypeFor(String fileName) {
        if (fileName.endsWith(".cha")) {
            return 1;
        }
        if (fileName.endsWith(".txt")) {
            return 3;
        }
        if (fileName.endsWith(".sht") || fileName.endsWith(".tbt")) {
            return 2;
        }
        if (fileName.endsWith(".html") || fileName.endsWith(".htm")) {
            return 4;
        }
        if (fileName.endsWith(".xml") || fileName.endsWith(".rt")) {
            return 5;
        }
        if (fileName.endsWith(".csv")) {
            return 6;
        }
        if (fileName.endsWith(".eaf")) {
            return 0;
        }
        if (fileName.endsWith(".pdf")) {
            return 7;
        }
        if (fileName.endsWith(".srt")) {
            return 8;
        }
        if (fileName.endsWith(".TextGrid")) {
            return 9;
        }
        return -1;
    }

    /**
     * Returns the value following the option at {@code optionIndex}; prints the
     * usage text and exits with status 1 when the option is the last argument.
     */
    private static String optionValue(String[] args, int optionIndex, String usage) {
        if (optionIndex + 1 >= args.length) {
            System.err.println("Missing value for option " + args[optionIndex]);
            System.err.println("Usage: " + usage);
            System.exit(1);
        }
        return args[optionIndex + 1];
    }

    /**
     * Like {@link #optionValue(String[], int, String)} but parses the value as an
     * integer; prints the usage text and exits with status 1 when it is not one.
     */
    private static int intOptionValue(String[] args, int optionIndex, String usage) {
        String raw = optionValue(args, optionIndex, usage);
        try {
            return Integer.parseInt(raw);
        } catch (NumberFormatException e) {
            System.err.println("Option " + args[optionIndex] + " needs an integer value, got: " + raw);
            System.err.println("Usage: " + usage);
            System.exit(1);
            throw e; // not reached; keeps the compiler satisfied
        }
    }

    /**
     * Command line entry point: indexes all supported files below DOCS_PATH
     * into the Lucene index at INDEX_PATH. Unknown arguments are ignored.
     *
     * @param args see the usage text printed when {@code -docs} or {@code -index} is missing
     */
    public static void main(String[] args) {
        String usage = "java " + LuceneIndexWriter.class.getName() + " [-ngramsize MAX_NGRAM_LENGTH] [-indexpositions] [-indexfrequencies MAX_LENGTH]" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index in\n" + "INDEX_PATH that can be searched with the SearchFiles Lucene demo etc.";
        String indexPath = null;
        String docsPath = null;
        boolean create = true;
        int max_ngram_len = MAX_NGRAM_LEN;
        boolean indexPositions = false;
        int indexFrequencies = 0;
        for (int i = 0; i < args.length; ++i) {
            String arg = args[i];
            if ("-index".equals(arg)) {
                indexPath = optionValue(args, i, usage);
                ++i;
            } else if ("-docs".equals(arg)) {
                docsPath = optionValue(args, i, usage);
                ++i;
            } else if ("-update".equals(arg)) {
                create = false;
            } else if ("-indexpositions".equals(arg)) {
                indexPositions = true;
            } else if ("-indexfrequencies".equals(arg)) {
                indexFrequencies = intOptionValue(args, i, usage);
                ++i;
            } else if ("-ngramsize".equals(arg)) {
                max_ngram_len = intOptionValue(args, i, usage);
                ++i;
            }
        }
        if (docsPath == null || indexPath == null) {
            System.err.println("Usage: " + usage);
            System.exit(1);
        }
        File docDir = new File(docsPath);
        if (!docDir.exists() || !docDir.canRead()) {
            _logger.error("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path");
            System.err.println("Document directory does not exist or is not readable!");
            System.exit(1);
        }

        StringBuilder initMessage = new StringBuilder(128);
        initMessage.append("Indexing N-grams up to length: ");
        initMessage.append(max_ngram_len);
        initMessage.append(" (with" + (indexPositions ? "" : "out"));
        initMessage.append(" positions,");
        initMessage.append(" with" + (indexFrequencies >= 0 ? "" : "out"));
        initMessage.append(" frequencies");
        initMessage.append(indexFrequencies > 0 ? " for 1.." + indexFrequencies + "-grams" : "");
        initMessage.append(") to directory: ");
        initMessage.append(indexPath);
        System.err.println(initMessage.toString());
        _logger.warn(initMessage.toString());

        long startMillis = System.currentTimeMillis();
        try {
            LuceneIndexWriter liw = new LuceneIndexWriter(new File(indexPath), !create, max_ngram_len, indexPositions, indexFrequencies);
            liw.walkDirectory(docDir);
            // NOTE(review): the writer is closed on success only, so a failed run
            // does not go through close() here.
            liw.close();
        } catch (IOException e) {
            _logger.error("caught a " + e.getClass() + " saying " + e.getMessage(), e);
        }
        long elapsedMillis = System.currentTimeMillis() - startMillis;
        System.err.println("Elapsed time: " + elapsedMillis + " total milliseconds");
        _logger.info("Done, max_ngram_length = " + max_ngram_len + ", elapsed time: " + elapsedMillis + " msec");
    }
}

