package net.os10000.bldsys.app_zeitgeist_v2;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Calendar;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.os10000.bldsys.lib_logger.Logger;
import net.os10000.bldsys.mod_concurrent.ParallelProcessor;

/* loaded from: input_file:net/os10000/bldsys/app_zeitgeist_v2/HtmlToStruc.class */
public class HtmlToStruc implements DoWork {
    /** Running document id; incremented in {@code myProcessor.mk_doc} right before each Document is constructed. */
    static long doc_id = 0;
    /** Inclusive lower bound compared against {@code Document.year} in {@code Consumer.put}; set from strArr[9] in {@code work}. */
    static long begin_year = 0;
    /** Exclusive upper bound compared against {@code Document.year} in {@code Consumer.put}; set from strArr[10] in {@code work}. */
    static long end_year = 0;
    /**
     * Lookup from a path component to a language code. {@code work} registers
     * "dn" -> "en" and "gn" -> "de" and replaces the map with an empty one when it finishes.
     */
    static Map m = new HashMap();
    /** Matches a line that starts with an {@code <A NAME=digits>} anchor and captures the rest of that line. */
    static Pattern p = Pattern.compile("^<A NAME=[0-9]*>(.*)$");
    /** Becomes true once {@code mk_doc} has seeded {@link #min_day} and {@link #max_day} with a first value. */
    private static boolean got_first = false;
    /** Smallest day value seen by {@code mk_doc}; written to the output file by {@code work}. */
    private static long min_day = 0;
    /** Largest day value seen by {@code mk_doc}; written to the output file by {@code work}. */
    private static long max_day = 0;

    /* loaded from: input_file:net/os10000/bldsys/app_zeitgeist_v2/HtmlToStruc$Consumer.class */
    /**
     * Sink for processed documents: serialises every Document whose year lies in
     * {@code [begin_year, end_year)} to an {@link ObjectOutputStream} and counts
     * all documents it receives.
     */
    public static class Consumer implements ParallelProcessor.Consumer {
        /** Number of non-null objects received so far (written or not). */
        long doc;
        FileOutputStream fos;
        ObjectOutputStream oos;
        /** Logger used to report failures in {@link #put(Object)}. */
        Logger l;

        /**
         * Opens the output file for serialised documents.
         *
         * @param logger logger for diagnostics; also kept for later use by {@link #put(Object)}
         * @param str    path of the file to write
         */
        public Consumer(Logger logger, String str) {
            // Keep the logger before anything can fail: put() reports errors through
            // this.l, which previously stayed null and turned every failure into an NPE.
            this.l = logger;
            this.doc = 0L;
            try {
                this.fos = new FileOutputStream(str);
                this.oos = new ObjectOutputStream(this.fos);
            } catch (Exception e) {
                logger.log_stacktrace(e);
            }
        }

        /**
         * Accepts one processed object. A {@code null} argument closes the output
         * stream; anything else is cast to Document, written if its year is within
         * the configured range, and counted.
         *
         * @param obj a Document, or {@code null} to close the stream
         */
        @Override // net.os10000.bldsys.mod_concurrent.ParallelProcessor.Consumer
        public void put(Object obj) {
            try {
                if (obj == null) {
                    this.oos.close();
                } else {
                    Document document = (Document) obj;
                    if (HtmlToStruc.begin_year <= document.year && document.year < HtmlToStruc.end_year) {
                        this.oos.writeObject(obj);
                    }
                    this.doc++;
                }
            } catch (Exception e) {
                this.l.log_stacktrace(e);
            }
        }

        /**
         * Reports progress as {@code "doc=<count>"}.
         *
         * @param i not used by this implementation
         * @return the progress text
         */
        @Override // net.os10000.bldsys.mod_concurrent.ParallelProcessor.StatSource
        public String stats(int i) {
            return "doc=" + this.doc;
        }
    }

    /* loaded from: input_file:net/os10000/bldsys/app_zeitgeist_v2/HtmlToStruc$myProcessor.class */
    public static class myProcessor implements ParallelProcessor.Callable {
        /** Path of the file this task reads in {@code call()}. */
        String fn;
        /** Logger used for diagnostics and stack traces. */
        Logger l;

        /**
         * Creates a task for one input file.
         *
         * @param logger logger to report to
         * @param str    path of the file to process
         */
        public myProcessor(Logger logger, String str) {
            this.fn = str;
            this.l = logger;
        }

        /**
         * Re-joins the text with a tab between sentences, as detected by a
         * sentence {@link BreakIterator}. A segment whose first character is the
         * middle dot (code point 183) is appended without a separator.
         *
         * @param str  text to segment
         * @param str2 language code; "en" selects {@link Locale#UK}, anything else {@link Locale#GERMAN}
         * @return the segments concatenated, with "\t" before every non-middle-dot
         *         segment except the first such segment
         */
        private String break_sentences(String str, String str2) {
            Locale locale = str2.equals("en") ? Locale.UK : Locale.GERMAN;
            BreakIterator sentences = BreakIterator.getSentenceInstance(locale);
            sentences.setText(str);

            StringBuilder joined = new StringBuilder();
            String separator = "";
            int start = sentences.first();
            for (int end = sentences.next(); end != BreakIterator.DONE; end = sentences.next()) {
                if (str.charAt(start) != '\u00b7') {
                    // Ordinary sentence: separate it from what came before.
                    joined.append(separator);
                    separator = "\t";
                }
                joined.append(str, start, end);
                start = end;
            }
            return joined.toString();
        }

        /**
         * Formats one line of preformatted text: prefixes "|", turns every space
         * into a middle dot, pads with middle dots up to 75 characters and
         * appends a single trailing space.
         *
         * @param str one line without its line terminator
         * @return the formatted line (never shorter than 76 characters)
         */
        private String process_line(String str) {
            StringBuilder row = new StringBuilder("|");
            row.append(str.replaceAll(" ", "·"));
            while (row.length() < 75) {
                row.append("·");
            }
            return row.append(" ").toString();
        }

        /**
         * Formats the body of a preformatted block: the text is cut at every
         * '\n' and each piece (including a trailing empty one) is passed through
         * {@code process_line}. The result starts with a single space.
         *
         * @param str text between the opening and closing PRE tags
         * @return " " followed by the formatted lines
         */
        private String process_block(String str) {
            StringBuilder out = new StringBuilder(" ");
            int lineStart = 0;
            for (int nl = str.indexOf('\n'); nl >= 0; nl = str.indexOf('\n', lineStart)) {
                out.append(process_line(str.substring(lineStart, nl)));
                lineStart = nl + 1;
            }
            // Whatever follows the last newline (possibly nothing) is a line too.
            out.append(process_line(str.substring(lineStart)));
            return out.toString();
        }

        /**
         * Replaces every {@code <PRE>...</PRE>} section with the output of
         * {@code process_block} for its body; text outside such sections is
         * copied unchanged. An opening tag without a matching closing tag is
         * dropped and scanning continues right after it.
         *
         * @param str raw text
         * @return the text with preformatted sections rewritten
         */
        private String fix_preformatted(String str) {
            final String open = "<PRE>";
            final String close = "</PRE>";
            StringBuilder out = new StringBuilder();
            int pos = 0;
            int openAt = str.indexOf(open);
            while (openAt > -1) {
                out.append(str, pos, openAt);
                int bodyStart = openAt + open.length();
                // Look for the closing tag only AFTER the opening tag. Searching from
                // 'pos' could find a stray closing tag located before the opening one,
                // which made the substring below throw StringIndexOutOfBoundsException.
                int closeAt = str.indexOf(close, bodyStart);
                if (closeAt > -1) {
                    out.append(process_block(str.substring(bodyStart, closeAt)));
                    pos = closeAt + close.length();
                } else {
                    // Unterminated section: skip just the opening tag.
                    pos = bodyStart;
                }
                openAt = str.indexOf(open, pos);
            }
            return out.append(str, pos, str.length()).toString();
        }

        /**
         * Inserts missing spaces: one before "Der"/"Die"/"Das" when followed by a
         * character other than a lowercase ASCII letter, and one after a full
         * stop that is directly followed by an uppercase ASCII letter.
         *
         * @param str text to repair
         * @return the text with the extra spaces inserted
         */
        private String fix_joined_sentences(String str) {
            String articlesSpaced = str.replaceAll("(D(er|ie|as)[^a-z])", " $1");
            String stopsSpaced = articlesSpaced.replaceAll("\\.([A-Z])", ". $1");
            return stopsSpaced;
        }

        /**
         * Applies a fixed list of textual substitutions in order: "/" gets a
         * trailing space, and a set of "=XX" escape sequences is replaced by the
         * corresponding character or an ASCII stand-in.
         *
         * @param str text to translate
         * @return the text after all substitutions
         */
        private String fix_sonderzeichen(String str) {
            // Order matters: later rules see the output of earlier ones.
            String[][] rules = {
                {"/", "/ "},
                {"=20", " "},
                {"=2E", "."},
                {"=3D", "="},
                {"=AB", " << "},
                {"=B4", "'"},
                {"=BB", " >> "},
                {"=C4", "Ä"},
                {"=D6", "Ö"},
                {"=DC", "Ü"},
                {"=DF", "ß"},
                {"=E4", "ä"},
                {"=F6", "ö"},
                {"=FC", "ü"},
                {"=E9", "e"},
                {"=ED", "i"},
                {"=B0", " deg "},
            };
            String result = str;
            for (String[] rule : rules) {
                result = result.replaceAll(rule[0], rule[1]);
            }
            return result;
        }

        /**
         * Normalises the accumulated text of one article, splits it into
         * sentences and words, and appends the result to {@code document.articles}.
         * Each kept sentence is stored as a {@code String[]} of its
         * space-separated tokens; sentences of length 0 or 1 are dropped.
         *
         * @param document document that receives the article
         * @param str      raw article text collected by {@code call()}
         */
        private void final_cleanup(Document document, String str) {
            // Character-level repairs before sentence detection.
            String text = fix_preformatted(str);
            text = fix_sonderzeichen(text);
            text = text.replaceAll("([a-z])([A-Z])", "$1 $2");
            text = fix_joined_sentences(text);

            // Sentence boundaries become tabs.
            text = break_sentences(text, document.lang);

            // Whitespace and punctuation normalisation on the tab-separated text.
            text = text.replaceAll("([A-Za-z]*)[-:]([A-Za-z]*)", "$1 $2");
            text = text.replaceAll("[\\n]", " ");
            text = text.replaceAll("  *", " ");
            text = text.replaceAll("[\\t] ", "\t");
            text = text.replaceAll(" [\\t]", "\t");
            text = text.replaceAll(" $", "");

            LinkedList sentences = new LinkedList();
            String[] pieces = text.split("\t");
            for (int k = 0; k < pieces.length; k++) {
                String sentence = pieces[k].replaceAll(" +", " ");
                sentence = sentence.replaceAll("^ +", "");
                sentence = sentence.replaceAll(" +$", "");
                sentence = sentence.replaceAll("([a-zA-Z])[:(-]([0-9])", "$1 $2");
                if (sentence.length() <= 1) {
                    continue;
                }
                sentences.add(sentence.split(" "));
            }
            document.articles.add(sentences);
        }

        /**
         * Builds a Document from a file path. The last four path components are
         * used: one is looked up in {@code HtmlToStruc.m} to get the language, the
         * next two and the first two characters of the file name are handed to
         * {@code utils.cal} (presumably year, month and day - the helper is not
         * visible here). Also widens the global min/max day range and assigns the
         * next document id.
         *
         * NOTE(review): the static counters updated here are not synchronised;
         * confirm whether ParallelProcessor runs tasks on several threads.
         *
         * @param str path of the source file, with either '/' or '\' separators
         * @return the new Document
         */
        private Document mk_doc(String str) {
            String[] parts = str.replace('\\', '/').split("/");
            int base = parts.length - 5;
            String lang = (String) HtmlToStruc.m.get(parts[base + 1]);
            String firstText = parts[base + 2];
            String secondText = parts[base + 3];
            String thirdText = parts[base + 4].substring(0, 2);

            Calendar cal = utils.cal(firstText, secondText, thirdText);
            long day = utils.day(cal);

            if (!HtmlToStruc.got_first) {
                // First document seen: seed both bounds.
                HtmlToStruc.min_day = day;
                HtmlToStruc.max_day = day;
                HtmlToStruc.got_first = true;
            } else {
                HtmlToStruc.min_day = Math.min(HtmlToStruc.min_day, day);
                HtmlToStruc.max_day = Math.max(HtmlToStruc.max_day, day);
            }

            int first = Integer.parseInt(firstText);
            int second = Integer.parseInt(secondText);
            int third = Integer.parseInt(thirdText);
            int dow = utils.dow(cal);

            HtmlToStruc.doc_id++;
            return new Document(HtmlToStruc.doc_id, str, lang, day, first, second, third, dow);
        }

        /**
         * Reads the file {@code fn} line by line and turns it into one Document.
         * A line starting with {@code <A NAME=} begins a new article: the first
         * such line creates the Document via {@code mk_doc}, every later one
         * flushes the text collected so far through {@code final_cleanup}. Text
         * before the first anchor is discarded. Selected HTML tags are stripped
         * from each line, and an anchor line is reduced to its remainder followed
         * by a tab.
         *
         * Any exception is logged and the Document built so far (possibly
         * {@code null}) is returned.
         *
         * @return the Document, or {@code null} if the file contained no anchor
         *         line or failed before one was seen
         */
        @Override // java.util.concurrent.Callable
        public Object call() {
            Document document = null;
            try {
                BufferedReader bufferedReader = new BufferedReader(new FileReader(this.fn));
                // try/finally so the reader is released even when parsing fails
                // part-way; previously an exception skipped close() and leaked the
                // file handle.
                try {
                    // StringBuilder avoids re-copying the whole article for every line.
                    StringBuilder text = new StringBuilder();
                    String separator = "";
                    for (String readLine = bufferedReader.readLine(); readLine != null; readLine = bufferedReader.readLine()) {
                        String line = readLine.replaceAll("\r", "") + "\n";
                        if (line.startsWith("<A NAME=")) {
                            if (document == null) {
                                document = mk_doc(this.fn);
                            } else {
                                final_cleanup(document, text.toString());
                            }
                            text.setLength(0);
                            separator = "";
                        }
                        String cleaned = line.replaceAll("(&nbsp;|\\t)", " ").replaceAll("^<title>.*</title>$", "").replaceAll("<[/]*(html|head|head|center|tr|td|BR|B|I|font|table|TR|TD|TABLE|CENTER|BODY|P)>", "").replaceAll("<(body|table|font|TABLE)[^<>]*>", "").replaceAll("^<A HREF=#[0-9]*>.*</A>$", "");
                        Matcher matcher = HtmlToStruc.p.matcher(cleaned);
                        if (matcher.find()) {
                            // Anchor line: keep only what follows the anchor, tab-terminated.
                            cleaned = matcher.group(1) + "\t";
                        }
                        text.append(separator).append(cleaned);
                        separator = " ";
                    }
                    if (document == null) {
                        this.l.logln("fn=" + this.fn + ", doc=null");
                    } else {
                        final_cleanup(document, text.toString());
                    }
                } finally {
                    bufferedReader.close();
                }
            } catch (Exception e) {
                this.l.log_stacktrace(e);
            }
            return document;
        }
    }

    /**
     * Walks {@code file} recursively in sorted name order and submits one
     * {@code myProcessor} task per non-empty regular file to {@code processor}.
     * An entry that cannot be listed (not a directory, unreadable, or an I/O
     * error) is logged and skipped.
     *
     * @param logger    logger handed to each task and used for skip messages
     * @param file      directory to walk
     * @param processor receiver of the created tasks
     * @throws IOException declared by the original signature
     */
    static void process_dir(Logger logger, File file, ParallelProcessor.Processor processor) throws IOException {
        String[] list = file.list();
        if (list == null) {
            // File.list() returns null instead of throwing; without this check
            // Arrays.sort(null) aborted the whole walk with a NullPointerException.
            logger.logln("cannot list directory, skipping: " + file);
            return;
        }
        Arrays.sort(list);
        for (String str : list) {
            File file2 = new File(file, str);
            if (!file2.isFile()) {
                process_dir(logger, file2, processor);
            } else if (file2.length() > 0) {
                processor.put(new myProcessor(logger, file2.toString()));
            }
        }
    }

    /**
     * Entry point of this work step. Reads its configuration from
     * {@code strArr[2..10]}, feeds every file under the configured corpus
     * sub-directories through a ParallelProcessor into a {@code Consumer}, and
     * finally writes "{@code min_day max_day}" to a separate file.
     *
     * Arguments used: [2] base directory, [3] and [4] integers forwarded to
     * {@code ParallelProcessor.process} (their meaning is not visible here),
     * [5] corpus directory, [6] space-separated sub-directories, [7] output
     * file for the serialised documents, [8] output file for the day range,
     * [9] begin year, [10] end year. Any exception is logged, not rethrown.
     *
     * @param logger logger for progress and errors
     * @param strArr argument vector as described above
     */
    @Override // net.os10000.bldsys.app_zeitgeist_v2.DoWork
    public void work(Logger logger, String[] strArr) {
        try {
            String baseDir = strArr[2] + File.separator;
            int processArgA = Integer.parseInt(strArr[3]);
            int processArgB = Integer.parseInt(strArr[4]);
            String corpus = strArr[5];
            String subDirs = strArr[6];
            String cleaned = strArr[7];
            String daysFile = strArr[8];
            begin_year = Integer.parseInt(strArr[9]);
            end_year = Integer.parseInt(strArr[10]);
            logger.loglnts("corpus=" + corpus);
            logger.loglnts("cleaned=" + cleaned);
            m.put("dn", "en");
            m.put("gn", "de");
            ParallelProcessor.Processor process = ParallelProcessor.process(logger, null, new Consumer(logger, baseDir + cleaned), processArgA, processArgB, 100L);
            process.start_stats(20, false);
            for (String subDir : subDirs.split(" ")) {
                process_dir(logger, new File(new File(baseDir + corpus), subDir), process);
            }
            // null marks the end of the input.
            process.put(null);
            process.wait_until_done();
            FileWriter fileWriter = new FileWriter(baseDir + daysFile);
            try {
                fileWriter.write(min_day + " " + max_day);
            } finally {
                // Close even if the write fails so the file handle is not leaked.
                fileWriter.close();
            }
            m = new HashMap();
        } catch (Exception e) {
            logger.log_stacktrace(e);
        }
    }
}
