/*
 * Decompiled with CFR 0.152.
 */
package opennlp.tools.tokenize;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import opennlp.maxent.ContextGenerator;
import opennlp.maxent.DataIndexer;
import opennlp.maxent.EventCollector;
import opennlp.maxent.EventCollectorAsStream;
import opennlp.maxent.EventStream;
import opennlp.maxent.GIS;
import opennlp.maxent.GISModel;
import opennlp.maxent.MaxentModel;
import opennlp.maxent.TwoPassDataIndexer;
import opennlp.maxent.io.SuffixSensitiveGISModelWriter;
import opennlp.tools.tokenize.TokContextGenerator;
import opennlp.tools.tokenize.TokEventCollector;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.ObjectIntPair;
import opennlp.tools.util.Span;

/**
 * A {@link Tokenizer} that refines a whitespace split with a maxent model.
 *
 * <p>The input is first cut into whitespace-delimited chunks (see
 * {@link #split(String)}). For every chunk of two or more characters the
 * model is then consulted at each interior character position; whenever its
 * best outcome is the split outcome, the chunk is cut at that position.
 *
 * <p>{@link #tokenizePos(String)} clears and refills instance-level lists on
 * every call, so a single instance must not be used by several threads at
 * once, and {@link #getTokenProbabilities()} always refers to the most recent
 * call.
 */
public class TokenizerME implements Tokenizer {
    /** Outcome label which, when it is the model's best outcome, causes a split. */
    private static final String SPLIT_OUTCOME = "T";

    /** Shared boxed probability for tokens that are accepted without consulting the model. */
    private static final Double ONE = Double.valueOf(1.0);

    /** Matches chunks made up solely of ASCII letters and digits. */
    public static Pattern alphaNumeric = Pattern.compile("^[A-Za-z0-9]+$");

    /** Model evaluated at each candidate split position. */
    private final MaxentModel model;

    /** Produces the model context for a (chunk, offset) pair. */
    private final ContextGenerator cg = new TokContextGenerator();

    /** When true, purely alphanumeric chunks are emitted whole without asking the model. */
    private boolean alphaNumericOptimization;

    /** Probabilities of the tokens produced by the most recent {@link #tokenizePos(String)} call. */
    private List<Double> tokProbs;

    /** Spans produced by the most recent {@link #tokenizePos(String)} call. */
    private List<Span> newTokens;

    /**
     * Creates a tokenizer backed by the given model, with the alphanumeric
     * optimization switched off.
     *
     * @param mod the model used to decide where chunks are split
     */
    public TokenizerME(MaxentModel mod) {
        this.setAlphaNumericOptimization(false);
        this.model = mod;
        this.newTokens = new ArrayList<Span>();
        this.tokProbs = new ArrayList<Double>(50);
    }

    /**
     * Returns the probabilities recorded for the tokens of the most recent
     * {@link #tokenizePos(String)} (or {@link #tokenize(String)}) call.
     *
     * @return a fresh array with one probability per token, in token order;
     *         empty if nothing has been tokenized yet
     */
    public double[] getTokenProbabilities() {
        double[] result = new double[this.tokProbs.size()];
        for (int i = 0; i < result.length; ++i) {
            result[i] = this.tokProbs.get(i).doubleValue();
        }
        return result;
    }

    /**
     * Tokenizes the string and returns the token boundaries.
     *
     * <p>Chunks shorter than two characters, and (when the alphanumeric
     * optimization is enabled) chunks matching {@link #alphaNumeric}, are
     * emitted unchanged with probability 1. For every other chunk the
     * probability of a token is the product of the best-outcome probabilities
     * of all decisions made since the previous split.
     *
     * @param d the text to tokenize
     * @return the spans of the tokens, as offsets into {@code d}
     */
    public Span[] tokenizePos(String d) {
        Span[] chunks = TokenizerME.split(d);
        this.newTokens.clear();
        this.tokProbs.clear();
        for (Span chunk : chunks) {
            String tok = d.substring(chunk.getStart(), chunk.getEnd());
            boolean keepWhole = tok.length() < 2
                    || (this.useAlphaNumericOptimization() && alphaNumeric.matcher(tok).matches());
            if (keepWhole) {
                this.newTokens.add(chunk);
                this.tokProbs.add(ONE);
                continue;
            }
            int chunkStart = chunk.getStart();
            int chunkEnd = chunk.getEnd();
            int tokenStart = chunkStart;
            double tokenProb = 1.0;
            // Ask the model about every position strictly inside the chunk.
            for (int j = chunkStart + 1; j < chunkEnd; ++j) {
                // The context is built from the chunk text and the offset relative to the chunk start.
                double[] probs = this.model.eval(
                        this.cg.getContext((Object) new ObjectIntPair(tok, j - chunkStart)));
                String best = this.model.getBestOutcome(probs);
                tokenProb *= probs[this.model.getIndex(best)];
                if (best.equals(SPLIT_OUTCOME)) {
                    this.newTokens.add(new Span(tokenStart, j));
                    this.tokProbs.add(Double.valueOf(tokenProb));
                    tokenStart = j;
                    tokenProb = 1.0;
                }
            }
            // Whatever remains after the last split is the final token of the chunk.
            this.newTokens.add(new Span(tokenStart, chunkEnd));
            this.tokProbs.add(Double.valueOf(tokenProb));
        }
        return this.newTokens.toArray(new Span[this.newTokens.size()]);
    }

    /**
     * Tokenizes the string and returns the token texts.
     *
     * @param s the text to tokenize
     * @return the substrings of {@code s} covered by the spans from
     *         {@link #tokenizePos(String)}
     */
    public String[] tokenize(String s) {
        Span[] spans = this.tokenizePos(s);
        String[] toks = new String[spans.length];
        for (int i = 0; i < spans.length; ++i) {
            toks[i] = s.substring(spans[i].getStart(), spans[i].getEnd());
        }
        return toks;
    }

    /**
     * Splits the string into maximal runs of non-whitespace characters, where
     * whitespace is whatever {@link Character#isWhitespace(char)} accepts.
     *
     * @param d the text to split
     * @return one span per run, in order of appearance; empty if {@code d}
     *         is empty or contains only whitespace
     */
    public static Span[] split(String d) {
        List<Span> tokens = new ArrayList<Span>();
        int length = d.length();
        // -1 means "currently between tokens".
        int tokStart = -1;
        for (int i = 0; i < length; ++i) {
            boolean whitespace = Character.isWhitespace(d.charAt(i));
            if (whitespace && tokStart >= 0) {
                tokens.add(new Span(tokStart, i));
                tokStart = -1;
            } else if (!whitespace && tokStart < 0) {
                tokStart = i;
            }
        }
        if (tokStart >= 0) {
            // The text ended in the middle of a token.
            tokens.add(new Span(tokStart, length));
        }
        return tokens.toArray(new Span[tokens.size()]);
    }

    /**
     * Trains a tokenizer model from the given events and writes it to a file.
     *
     * @param evc    the training events
     * @param output the file the trained model is persisted to
     * @throws IOException if indexing the events or writing the model fails
     */
    public static void train(EventStream evc, File output) throws IOException {
        // NOTE(review): presumably 5 is the feature cutoff and 100 the number of
        // GIS iterations - confirm against the maxent API.
        DataIndexer indexer = new TwoPassDataIndexer(evc, 5);
        GISModel tokModel = GIS.trainModel(100, indexer, true);
        new SuffixSensitiveGISModelWriter(tokModel, output).persist();
    }

    /**
     * Trains a tokenizer model from command-line style arguments.
     *
     * @param args {@code args[0]} is the path of the training data file,
     *             {@code args[1]} the path the model is written to
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws IOException if the training data cannot be read or the model
     *                     cannot be written
     */
    public static void train(String[] args) throws IOException {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException("Usage: TokenizerME trainingFile modelFile");
        }
        FileReader datafr = new FileReader(new File(args[0]));
        try {
            File output = new File(args[1]);
            EventCollector collector = new TokEventCollector(datafr);
            EventStream events = new EventCollectorAsStream(collector);
            TokenizerME.train(events, output);
        } finally {
            // Always release the file handle, including when training fails.
            datafr.close();
        }
    }

    /**
     * Enables or disables the shortcut that emits purely alphanumeric chunks
     * as single tokens without consulting the model.
     *
     * @param opt {@code true} to enable the shortcut
     */
    public void setAlphaNumericOptimization(boolean opt) {
        this.alphaNumericOptimization = opt;
    }

    /**
     * Reports whether the alphanumeric shortcut is enabled.
     *
     * @return {@code true} if purely alphanumeric chunks bypass the model
     */
    public boolean useAlphaNumericOptimization() {
        return this.alphaNumericOptimization;
    }

    /**
     * Command-line entry point; delegates to {@link #train(String[])}.
     *
     * @param args training data path followed by model output path
     * @throws IOException if training fails because of an I/O error
     */
    public static void main(String[] args) throws IOException {
        TokenizerME.train(args);
    }
}

