/*
 * File:     Transcription2WordList.java
 * Project:  MPI Linguistic Application
 * Date:     12 December 2007
 *
 * Copyright (C) 2001-2008  Max Planck Institute for Psycholinguistics
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

package mpi.eudico.client.util;

import mpi.eudico.client.annotator.util.ClientLogger;

import mpi.eudico.server.corpora.clom.Annotation;
import mpi.eudico.server.corpora.clom.Transcription;

import mpi.eudico.server.corpora.clomimpl.abstr.TierImpl;
import mpi.eudico.server.corpora.clomimpl.abstr.TranscriptionImpl;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;

import java.nio.charset.UnsupportedCharsetException;

import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;


/**
 * Extracts unique words from a selection of tiers and writes the results to a
 * text file, one word per line, sorted in natural String order.
 *
 * <p>Annotation values are split into words with a {@link StringTokenizer}
 * using a configurable set of delimiter characters. Note that delimiters
 * passed to one of the <code>exportWords</code> methods replace the
 * delimiters stored in this instance and therefore stay in effect for
 * subsequent calls.</p>
 *
 * @author Han Sloetjes
 * @version 1.0
 */
public class Transcription2WordList implements ClientLogger {
    /** Line separator written after each exported word. */
    private static final String NEWLINE = "\n";

    /** Encoding used when the requested encoding is absent or unsupported. */
    private static final String FALLBACK_ENCODING = "UTF-8";

    /**
     * The characters at which annotation values are split into words. An
     * empty string means that annotation values are not split at all.
     */
    private String delimiters = " \t\n\r\f.,!?\"\'";

    /**
     * Creates a new Transcription2WordList instance
     */
    public Transcription2WordList() {
        super();
    }

    /**
     * Exports the unique words from a selection of tiers of a single
     * transcription.
     *
     * @param transcription the transcription containing the tiers; if
     *        <code>null</code> an error is logged and nothing is written
     * @param tierNames a list of the selected tier names (Strings); if
     *        <code>null</code> all tiers of the transcription are used
     * @param exportFile the file to export to
     * @param charEncoding the encoding to use for the export file; if
     *        <code>null</code> or not supported, UTF-8 is used
     * @param delimiters the token delimiters; if <code>null</code> the
     *        delimiters currently stored in this instance are used
     *
     * @throws IOException if no file has been passed or when writing to the
     *         file fails
     */
    public void exportWords(Transcription transcription, List tierNames,
        File exportFile, String charEncoding, String delimiters)
        throws IOException {
        if (exportFile == null) {
            LOG.warning("No destination file specified for export");
            throw new IOException("No destination file specified for export");
        }

        if (transcription == null) {
            LOG.severe("No transcription specified for wordlist");

            return;
        }

        if (tierNames == null) {
            // getUniqueWords treats a null list of names as "all tiers"
            LOG.warning("No tiers specified for the wordlist: using all tiers");
        }

        if (delimiters != null) {
            this.delimiters = delimiters;
        }

        ArrayList uniqueWords = getUniqueWords(transcription, tierNames);
        Collections.sort(uniqueWords);

        writeWords(uniqueWords, exportFile, charEncoding);
    }

    /**
     * Exports the unique words from a selection of tiers from a number of
     * files. A file that cannot be loaded or processed is logged and
     * skipped; the export continues with the remaining files.
     *
     * @param files a list of eaf files (<code>File</code> objects);
     *        <code>null</code> entries are skipped
     * @param tierNames a list of the selected tier names (Strings); if
     *        <code>null</code> all tiers of each transcription are used
     * @param exportFile the file to export to
     * @param charEncoding the encoding to use for the export file; if
     *        <code>null</code> or not supported, UTF-8 is used
     * @param delimiters the token delimiters; if <code>null</code> the
     *        delimiters currently stored in this instance are used
     *
     * @throws IOException if no export file or no source files have been
     *         passed or when writing to the file fails
     */
    public void exportWords(List files, List tierNames, File exportFile,
        String charEncoding, String delimiters) throws IOException {
        if (exportFile == null) {
            LOG.warning("No destination file specified for export");
            throw new IOException("No destination file specified for export");
        }

        if ((files == null) || (files.size() == 0)) {
            LOG.warning("No files specified for export");
            throw new IOException("No files specified for export");
        }

        if (delimiters != null) {
            this.delimiters = delimiters;
        }

        // a set takes care of the de-duplication across files
        Set uniqueWords = new LinkedHashSet();
        File file;
        TranscriptionImpl trans;

        for (int i = 0; i < files.size(); i++) {
            file = (File) files.get(i);

            if (file == null) {
                continue;
            }

            try {
                trans = new TranscriptionImpl(file.getAbsolutePath());
                // words are only merged once the whole file has been
                // processed successfully
                uniqueWords.addAll(getUniqueWords(trans, tierNames));
            } catch (Exception ex) {
                // catch any exception that could occur and continue
                LOG.severe("Could not handle file: " + file.getAbsolutePath());
                // toString() also identifies exceptions without a message
                LOG.severe(ex.toString());
            }
        }

        ArrayList sortedWords = new ArrayList(uniqueWords);
        Collections.sort(sortedWords);

        writeWords(sortedWords, exportFile, charEncoding);
    }

    /**
     * Writes the words to the export file, one word per line. The file is
     * closed again in all cases, also when writing fails.
     *
     * @param words the words (Strings) to write, in the order to write them
     * @param exportFile the file to write to
     * @param charEncoding the requested encoding of the file
     *
     * @throws IOException if the file cannot be opened or written
     */
    private void writeWords(List words, File exportFile, String charEncoding)
        throws IOException {
        FileOutputStream out = new FileOutputStream(exportFile);
        BufferedWriter writer = null;

        try {
            writer = new BufferedWriter(createWriter(out, charEncoding));

            for (int i = 0; i < words.size(); i++) {
                writer.write((String) words.get(i));
                writer.write(NEWLINE);
            }
        } finally {
            if (writer != null) {
                // flushes the buffer and closes the underlying stream
                writer.close();
            } else {
                out.close();
            }
        }
    }

    /**
     * Creates a writer for the requested encoding, falling back to UTF-8 if
     * the encoding is <code>null</code> or not supported.
     *
     * @param out the stream to wrap
     * @param charEncoding the name of the requested encoding
     *
     * @return a writer on top of the stream
     *
     * @throws IOException if the fallback writer cannot be created
     */
    private OutputStreamWriter createWriter(FileOutputStream out,
        String charEncoding) throws IOException {
        if (charEncoding != null) {
            try {
                return new OutputStreamWriter(out, charEncoding);
            } catch (UnsupportedEncodingException uee) {
                // this is what the constructor throws for an unknown name
                LOG.warning("Unsupported encoding: " + charEncoding +
                    ", using " + FALLBACK_ENCODING);
            } catch (UnsupportedCharsetException uce) {
                LOG.warning("Unsupported encoding: " + charEncoding +
                    ", using " + FALLBACK_ENCODING);
            }
        }

        return new OutputStreamWriter(out, FALLBACK_ENCODING);
    }

    /**
     * Creates a list of unique words in the specified tiers from the specified
     * transcription. The words are in the order in which they are first
     * encountered. Null annotations and annotations with a null or empty
     * value are skipped.
     *
     * @param transcription the transcription
     * @param tierNames the names of the tiers, or <code>null</code> to use
     *        all tiers of the transcription
     *
     * @return the words, an empty list if the transcription is
     *         <code>null</code>
     */
    private ArrayList getUniqueWords(Transcription transcription, List tierNames) {
        if (transcription == null) {
            LOG.severe("No transcription specified to extract words from");

            return new ArrayList();
        }

        List annos = collectAnnotations(transcription, tierNames);

        // keeps the first-seen order while making the duplicate check cheap
        Set uniqueWords = new LinkedHashSet();
        Annotation ann;
        String value;
        StringTokenizer tokenizer;

        for (int i = 0; i < annos.size(); i++) {
            ann = (Annotation) annos.get(i);

            if (ann == null) {
                LOG.warning("Annotation is null");

                continue;
            }

            value = ann.getValue();

            if ((value == null) || (value.length() == 0)) {
                continue;
            }

            if (delimiters.length() > 0) {
                tokenizer = new StringTokenizer(value, delimiters);

                while (tokenizer.hasMoreTokens()) {
                    uniqueWords.add(tokenizer.nextToken());
                }
            } else {
                // no delimiters: the complete value counts as one word
                uniqueWords.add(value);
            }
        }

        return new ArrayList(uniqueWords);
    }

    /**
     * Collects the annotations of the specified tiers, or of all tiers if no
     * tier names are specified. Tiers that cannot be found are logged and
     * skipped.
     *
     * @param transcription the transcription, not <code>null</code>
     * @param tierNames the names of the tiers, or <code>null</code> for all
     *        tiers
     *
     * @return a list containing the annotations of the tiers
     */
    private List collectAnnotations(Transcription transcription, List tierNames) {
        List annos = new ArrayList();
        TierImpl t;

        if (tierNames != null) {
            for (int i = 0; i < tierNames.size(); i++) {
                t = (TierImpl) transcription.getTierWithId((String) tierNames.get(
                            i));

                if (t != null) {
                    annos.addAll(t.getAnnotations());
                } else {
                    LOG.warning("No tier with name: " + tierNames.get(i));
                }
            }
        } else {
            List tiers = transcription.getTiers();

            for (int i = 0; i < tiers.size(); i++) {
                t = (TierImpl) tiers.get(i);

                if (t != null) {
                    annos.addAll(t.getAnnotations());
                } else {
                    // there is no list of names to report from in this branch
                    LOG.warning("Tier at index " + i + " is null");
                }
            }
        }

        return annos;
    }
}
