/*
 * Decompiled with CFR 0.152.
 */
package nl.mpi.annot.tools.data;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import nl.mpi.annot.tools.data.AnnexAnnotation;
import nl.mpi.annot.tools.data.AnnexTier;
import nl.mpi.annot.tools.data.AnnexTranscription;
import org.apache.log4j.Logger;

/**
 * Fills an {@link AnnexTranscription} with the text of a PDF file.
 *
 * <p>The text is obtained by running the external {@code pdftotext} program and
 * reading its standard output. Every non-empty output line becomes one annotation
 * on a "pdftext" tier and one matching annotation on a "pdflocation" tier that
 * records the page and line number the text came from.
 */
public class PDFParser {
    /** Not read anywhere in this class: empty lines never produce annotations. */
    private static final boolean _keepEmpty = false;
    private static final Logger _logger = Logger.getLogger(PDFParser.class.getName());

    /**
     * Reads the output of {@code pdftotext} line by line and stores it in the transcription.
     *
     * <p>If the very first line is {@code <html>}, the input is treated as an HTML wrapper:
     * header lines are consumed until {@code <pre>} is seen, and text lines are consumed
     * until {@code </pre>}. Within the header, {@code <title>} replaces the transcription
     * info with "Title: ...", and each {@code <meta name="..." content="...">} line is
     * appended to the info as "name: content". A meta tag that does not close on its own
     * line is continued with the following header lines. Without the {@code <html>} first
     * line, every line is treated as text.
     *
     * <p>A line starting with a form feed starts a new page and resets the line counter.
     * Text lines get a synthetic time span: each line lasts 100 time units per character
     * of its whitespace-collapsed content, and the next line starts where the previous ended.
     *
     * @param input         reader delivering the converter output
     * @param transcription receives the info text and the two tiers
     * @param filename      only used in log messages
     * @throws IOException if reading from {@code input} fails
     */
    private static void parseText(BufferedReader input, AnnexTranscription transcription, String filename) throws IOException {
        int rawLineNumber = 0;
        boolean htmlHeader = false;
        // literal == true: lines are document text; false: lines belong to the HTML header.
        // It can only become false after htmlHeader was set, so "!literal" implies htmlHeader.
        boolean literal = true;
        boolean inTag = false;
        AnnexTier textTier = null;
        AnnexTier placeTier = null;
        long time = 0L;
        int pageNumber = 1;
        int lineNumber = 1;
        transcription.setInfo("Title: (none)");
        String line;
        while ((line = input.readLine()) != null) {
            ++rawLineNumber;
            if (line.startsWith("\f")) {
                // A form feed at the start of a line marks a page break.
                ++pageNumber;
                lineNumber = 1;
                line = line.substring(1);
            }
            line = line.trim();
            if (rawLineNumber == 1 && line.equals("<html>")) {
                htmlHeader = true;
                literal = false;
                continue;
            }
            if (!literal) {
                // Header mode: none of these lines count as text lines.
                if (line.equals("<head>") || line.equals("</head>") || line.equals("<body>") || line.equals("</body>") || line.equals("</html>")) {
                    // Structural tags carry no information.
                } else if (line.startsWith("<title>")) {
                    String title = line.substring(7);
                    int endTitle = title.indexOf("</title>");
                    if (endTitle != -1) {
                        title = title.substring(0, endTitle);
                    }
                    transcription.setInfo("Title: " + title);
                } else if (line.startsWith("<meta name=\"")) {
                    String part = line.substring(12).replaceFirst("\" content=\"", ": ");
                    if (part.endsWith("\">")) {
                        part = part.substring(0, part.length() - 2);
                    } else {
                        // The content attribute continues on the following line(s).
                        inTag = true;
                    }
                    transcription.setInfo(transcription.getInfo() + "\n" + part);
                } else if (line.equals("<pre>")) {
                    literal = true;
                } else if (inTag) {
                    String part = line;
                    if (part.endsWith("\">")) {
                        part = part.substring(0, part.length() - 2);
                        inTag = false;
                    }
                    _logger.debug("Header tag continuation in line " + rawLineNumber + ": '" + line + "' in: " + filename);
                    transcription.setInfo(transcription.getInfo().trim() + " " + part);
                } else {
                    _logger.warn("Ignored extra" + (htmlHeader ? " HTML" : "") + " header in line " + rawLineNumber + ": '" + line + "' in: " + filename);
                }
                continue;
            }
            if (htmlHeader && line.equals("</pre>")) {
                literal = false;
                continue;
            }
            if (textTier == null) {
                // Tiers are created lazily on the first text line, even if that line is empty.
                textTier = createTier("pdftext");
                transcription.getTiers().add(textTier);
                placeTier = createTier("pdflocation");
                transcription.getTiers().add(placeTier);
            }
            line = line.replaceAll("\\s+", " ").trim();
            // The end time is based on the length before any leading-mark substitution below.
            long endTime = time + 100L * (long) line.length();
            if (line.length() > 0) {
                // U+2126 OHM SIGN is replaced by U+03A9 GREEK CAPITAL LETTER OMEGA.
                line = replaceLeadingCombiningMark(line.replace('\u2126', '\u03a9'));
                String lineId = "Line" + rawLineNumber;
                textTier.annotations.add(new AnnexAnnotation(line, time, endTime, false, lineId));
                String place = "page=" + pageNumber + "&line=" + lineNumber;
                placeTier.annotations.add(new AnnexAnnotation(place, time, endTime, false, lineId));
            }
            time = endTime;
            ++lineNumber;
        }
    }

    /**
     * Creates a tier with participant "unknown" and virtual time enabled.
     *
     * @param id first constructor argument of {@link AnnexTier}; the second is always "text"
     * @return the new tier, not yet attached to any transcription
     */
    private static AnnexTier createTier(String id) {
        AnnexTier tier = new AnnexTier(id, "text");
        tier.participant = "unknown";
        tier.hasVirtualTime = true;
        return tier;
    }

    /**
     * Rewrites a line that starts with certain Unicode combining marks.
     *
     * <p>U+0301, U+0300, U+0331, U+0336 and U+0303 are replaced by a stand-alone
     * look-alike character; U+0304, U+0306, U+031A and U+0328 are kept but get a
     * space in front of them. Presumably this prevents a mark without a base character
     * at the start of an annotation - TODO confirm with the display code.
     *
     * @param line the line to inspect
     * @return the rewritten line, or {@code line} itself if nothing applies
     */
    private static String replaceLeadingCombiningMark(String line) {
        if (line.length() == 0) {
            return line;
        }
        switch (line.charAt(0)) {
            case '\u0301':
                return "\u00b4" + line.substring(1);
            case '\u0300':
                return "`" + line.substring(1);
            case '\u0304':
            case '\u0306':
            case '\u031a':
            case '\u0328':
                return " " + line;
            case '\u0331':
                return "_" + line.substring(1);
            case '\u0336':
                return "-" + line.substring(1);
            case '\u0303':
                return "~" + line.substring(1);
            default:
                return line;
        }
    }

    /**
     * Waits until the process has terminated, retrying when the wait is interrupted.
     *
     * <p>An interrupt does not abort the wait, but the thread's interrupt status is
     * restored before returning so that callers further up can still react to it.
     *
     * @param process the process to wait for
     * @return the exit value of the process
     */
    private static int waitForExit(Process process) {
        boolean interrupted = false;
        try {
            while (true) {
                try {
                    return process.waitFor();
                }
                catch (InterruptedException ie) {
                    // waitFor() cleared the flag; remember it and keep waiting.
                    interrupted = true;
                }
            }
        }
        finally {
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Runs {@code pdftotext} on the file and parses its output into the transcription.
     *
     * <p>The converter writes to standard output ("-") with its error stream merged in,
     * so any diagnostics it prints are read like ordinary output lines. Conversions
     * that take longer than 250 ms are logged, with rising severity above 2.5 s and 12.5 s.
     *
     * @param file          the PDF file to convert
     * @param transcription receives the parsed content
     * @throws IOException if the converter cannot be started, its output cannot be read,
     *                     or it terminates with a non-zero exit value
     */
    public static void parsePDF(File file, AnnexTranscription transcription) throws IOException {
        BufferedReader input = null;
        Process pdftotext = null;
        int exitValue = 0;
        long startTime = System.currentTimeMillis();
        try {
            String path = file.getCanonicalPath();
            ArrayList<String> exec = new ArrayList<String>(12);
            exec.add("pdftotext");
            exec.add("-eol");
            exec.add("unix");
            exec.add("-enc");
            exec.add("UTF-8");
            exec.add("-layout");
            // NOTE(review): a fixed user password is always passed to the converter.
            exec.add("-upw");
            exec.add("secret");
            exec.add("-htmlmeta");
            exec.add(path);
            exec.add("-");
            ProcessBuilder pdftotextRunner = new ProcessBuilder(exec);
            pdftotextRunner.redirectErrorStream(true);
            pdftotext = pdftotextRunner.start();
            input = new BufferedReader(new InputStreamReader(pdftotext.getInputStream(), "UTF-8"), 4096);
            PDFParser.parseText(input, transcription, path);
        }
        catch (IOException ioe) {
            _logger.error("Failed to run pdftotext for file: " + file);
            throw ioe;
        }
        finally {
            if (input != null) {
                try {
                    input.close();
                }
                catch (IOException ioe2) {
                    _logger.warn("IOException when closing pipeline: " + ioe2);
                }
            }
            if (pdftotext != null) {
                exitValue = waitForExit(pdftotext);
                if (exitValue != 0) {
                    _logger.warn("pdftotext ended with non-zero exit value: " + exitValue + " for file: " + file);
                }
            }
        }
        // Checked outside the finally block so that an exception already propagating
        // from the try block is never replaced by the exit-value failure.
        if (exitValue != 0) {
            throw new IOException("pdftotext could not process file: " + exitValue);
        }
        long duration = System.currentTimeMillis() - startTime;
        if (duration > 12500L) {
            _logger.warn("pdftotext took " + duration + " msec to parse: " + file);
        } else if (duration > 2500L) {
            _logger.info("pdftotext took " + duration + " msec to parse: " + file);
        } else if (duration > 250L) {
            _logger.debug("pdftotext took " + duration + " msec to parse: " + file);
        }
    }

    /**
     * Command line entry point: constructs an {@link AnnexTranscription} for one file.
     *
     * <p>The transcription id is the part of the argument after the last '/'.
     * NOTE(review): the meaning of the literal 7 and what the constructor does with
     * the file are defined by AnnexTranscription and are not visible here.
     *
     * @param args {@code args[0]} is the path of the file to load
     * @throws IOException declared for the AnnexTranscription constructor call
     * @throws IllegalArgumentException if no file argument is given
     */
    public static void main(String[] args) throws IOException {
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException("Usage: PDFParser <file>");
        }
        String path = args[0];
        File file = new File(path);
        String id = path.substring(path.lastIndexOf('/') + 1);
        new AnnexTranscription(id, 7, file);
    }
}

