package org.apache.tika.parser.html;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.apache.tika.utils.Utils;
import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/* JADX WARN: Classes with same name are omitted:
  input_file:lib_apache_tika/lib/tika-0.2-jdk14.jar:org/apache/tika/parser/html/HtmlParser.class
 */
/* loaded from: input_file:lib_apache_tika/lib/tika-0.2.jar:org/apache/tika/parser/html/HtmlParser.class */
/**
 * Parser for HTML documents.
 *
 * <p>The input is tokenized with the CyberNeko {@link SAXParser} and the
 * resulting SAX events are fanned out to three handlers, each restricted to
 * one part of the document by an XPath-style matcher:
 * <ul>
 *   <li>content below {@code /HTML/BODY} is forwarded to the caller as a
 *       simplified XHTML stream (see {@link #getBodyHandler});</li>
 *   <li>content below {@code /HTML/HEAD/TITLE} is stored under the
 *       {@code "title"} metadata key;</li>
 *   <li>{@code META} elements in the head are copied into the metadata
 *       using their {@code name} / {@code http-equiv} attribute as key.</li>
 * </ul>
 *
 * <p>Element names are compared in upper case throughout, which is what the
 * matcher expressions and the lookup tables below expect.
 */
public class HtmlParser implements Parser {

    /** Upper-case HTML element name mapped to the XHTML element emitted in its place. */
    private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>();

    /** Upper-case HTML elements that are dropped together with everything nested in them. */
    private static final Set<String> DISCARD_ELEMENTS = new HashSet<String>();

    static {
        SAFE_ELEMENTS.put("P", "p");
        SAFE_ELEMENTS.put("H1", "h1");
        SAFE_ELEMENTS.put("H2", "h2");
        SAFE_ELEMENTS.put("H3", "h3");
        SAFE_ELEMENTS.put("H4", "h4");
        SAFE_ELEMENTS.put("H5", "h5");
        SAFE_ELEMENTS.put("H6", "h6");
        SAFE_ELEMENTS.put("UL", "ul");
        SAFE_ELEMENTS.put("OL", "ol");
        SAFE_ELEMENTS.put("LI", "li");
        SAFE_ELEMENTS.put("DL", "dl");
        SAFE_ELEMENTS.put("DT", "dt");
        SAFE_ELEMENTS.put("DD", "dd");
        SAFE_ELEMENTS.put("PRE", "pre");
        SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
        // Tables are not reproduced structurally; a table is emitted as a single paragraph.
        SAFE_ELEMENTS.put("TABLE", "p");

        DISCARD_ELEMENTS.add("STYLE");
        DISCARD_ELEMENTS.add("SCRIPT");
    }

    /**
     * Parses an HTML document, sending simplified XHTML to the given handler
     * and recording the title and META values in the given metadata.
     *
     * @param inputStream    the HTML document; it is wrapped in a
     *                       {@link CloseShieldInputStream} before being handed
     *                       to the HTML parser, so this method does not close it
     * @param contentHandler receiver of the generated XHTML events
     * @param metadata       metadata object that is updated with the title and
     *                       META values found in the document
     * @throws IOException   if the document cannot be read
     * @throws SAXException  if a handler or the underlying parser reports an error
     * @throws TikaException declared by the {@link Parser} contract
     */
    @Override
    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata) throws IOException, SAXException, TikaException {
        CloseShieldInputStream shielded = new CloseShieldInputStream(inputStream);
        XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata);

        // One matcher-guarded branch per document part of interest.
        XPathParser xpath = new XPathParser(null, "");
        MatchingContentHandler bodyBranch =
                new MatchingContentHandler(getBodyHandler(xhtml), xpath.parse("/HTML/BODY//node()"));
        MatchingContentHandler titleBranch =
                new MatchingContentHandler(getTitleHandler(metadata), xpath.parse("/HTML/HEAD/TITLE//node()"));
        MatchingContentHandler metaBranch =
                new MatchingContentHandler(getMetaHandler(metadata), xpath.parse("/HTML/HEAD/META//node()"));
        TeeContentHandler tee = new TeeContentHandler(bodyBranch, titleBranch, metaBranch);

        // The document start/end events are issued here on the XHTML handler
        // directly; the body branch only contributes element and text events.
        xhtml.startDocument();
        SAXParser htmlParser = new SAXParser();
        htmlParser.setContentHandler(new XHTMLDowngradeHandler(tee));
        htmlParser.parse(new InputSource(Utils.getUTF8Reader(shielded, metadata)));
        xhtml.endDocument();
    }

    /**
     * Creates the handler for the title branch. It accumulates the text it
     * receives and, whenever an element ends, stores the text collected so
     * far under the {@code "title"} metadata key.
     *
     * @param metadata metadata object to update
     * @return handler for events matched below {@code /HTML/HEAD/TITLE}
     */
    private ContentHandler getTitleHandler(final Metadata metadata) {
        return new WriteOutContentHandler() {
            @Override
            public void endElement(String uri, String local, String name) {
                metadata.set("title", toString());
            }
        };
    }

    /**
     * Creates the handler for the META branch. For each element it sees, the
     * value of the {@code content} attribute is stored in the metadata under
     * the element's {@code http-equiv} attribute and/or its {@code name}
     * attribute, whichever are present.
     *
     * <p>Elements without a {@code content} attribute are skipped so that no
     * null value is written into the metadata.
     *
     * @param metadata metadata object to update
     * @return handler for events matched below {@code /HTML/HEAD/META}
     */
    private ContentHandler getMetaHandler(final Metadata metadata) {
        return new WriteOutContentHandler() {
            @Override
            public void startElement(String uri, String local, String name, Attributes attributes) throws SAXException {
                String content = attributes.getValue("content");
                if (content == null) {
                    // Nothing to record: the element declares a key but no value.
                    return;
                }
                String httpEquiv = attributes.getValue("http-equiv");
                if (httpEquiv != null) {
                    metadata.set(httpEquiv, content);
                }
                String metaName = attributes.getValue("name");
                if (metaName != null) {
                    metadata.set(metaName, content);
                }
            }
        };
    }

    /**
     * Creates the handler for the body branch. It forwards text to the given
     * XHTML handler and translates elements as follows:
     * <ul>
     *   <li>elements listed in {@code DISCARD_ELEMENTS} are dropped together
     *       with all nested elements and text;</li>
     *   <li>elements listed in {@code SAFE_ELEMENTS} are emitted under their
     *       mapped XHTML name, without attributes;</li>
     *   <li>{@code A} elements are emitted as {@code a} with an {@code href}
     *       attribute (empty string when the source has none);</li>
     *   <li>any other element is omitted, while its text content still passes
     *       through.</li>
     * </ul>
     *
     * @param xHTMLContentHandler target of the generated XHTML events
     * @return handler for events matched below {@code /HTML/BODY}
     */
    private ContentHandler getBodyHandler(final XHTMLContentHandler xHTMLContentHandler) {
        return new TextContentHandler(xHTMLContentHandler) {
            /**
             * Nesting depth inside a discarded element; 0 means content is
             * currently being forwarded.
             */
            private int discardLevel = 0;

            @Override
            public void startElement(String uri, String local, String name, Attributes attributes) throws SAXException {
                if (discardLevel != 0) {
                    // Already inside a discarded element: only track the depth.
                    discardLevel++;
                    return;
                }
                if (DISCARD_ELEMENTS.contains(name)) {
                    discardLevel = 1;
                    return;
                }
                String mapped = SAFE_ELEMENTS.get(name);
                if (mapped != null) {
                    xHTMLContentHandler.startElement(mapped);
                } else if ("A".equals(name)) {
                    String href = attributes.getValue("href");
                    xHTMLContentHandler.startElement("a", "href", href != null ? href : "");
                }
            }

            @Override
            public void endElement(String uri, String local, String name) throws SAXException {
                if (discardLevel != 0) {
                    // Leaving a discarded element (or one nested inside it).
                    discardLevel--;
                    return;
                }
                String mapped = SAFE_ELEMENTS.get(name);
                if (mapped != null) {
                    xHTMLContentHandler.endElement(mapped);
                } else if ("A".equals(name)) {
                    xHTMLContentHandler.endElement("a");
                }
            }

            @Override
            public void characters(char[] ch, int start, int length) throws SAXException {
                if (discardLevel == 0) {
                    super.characters(ch, start, length);
                }
            }

            @Override
            public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
                if (discardLevel == 0) {
                    super.ignorableWhitespace(ch, start, length);
                }
            }
        };
    }
}
