/*
 * Copyright (c) 2003 by SAP AG. All Rights Reserved.
 *
 * SAP, mySAP, mySAP.com and other SAP products and
 * services mentioned herein as well as their respective
 * logos are trademarks or registered trademarks of
 * SAP AG in Germany and in several other countries all
 * over the world. MarketSet and Enterprise Buyer are
 * jointly owned trademarks of SAP AG and Commerce One.
 * All other product and service names mentioned are
 * trademarks of their respective companies.
 *
 * @version $Id$
 */

package com.sapportals.wcm.util.html;

import com.sap.tc.logging.Location;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

/**
 * HTMLReaderImpl is a default implementation of IHTMLReader. <p>
 *
 * Copyright (c) SAP AG 2001-2003
 *
 * @author stefan.eissing@greenbytes.de
 * @version $Id: HTMLReaderImpl.java,v 1.17 2003/03/20 18:41:30 jre Exp $
 */
class HTMLReaderImpl implements IHTMLReader {

  /**
   * Logging location of this class.
   */
  private static final Location log = Location.getLocation(HTMLReaderImpl.class);

  /*
   * Parser states. m_state is expected to hold exactly one of these values.
   */
  /** Start of document; nothing has been parsed yet. */
  private static final int S_START = 0;
  /** A possible tag follows, i.e. a '<' has been seen. */
  private static final int S_PRETAG = 1;
  /** Plain text recognition. */
  private static final int S_TEXT = 2;
  /** Element start tag. */
  private static final int S_TAG = 3;
  /** Element end tag. */
  private static final int S_ENDTAG = 4;
  /** Comment. */
  private static final int S_COMMENT = 5;
  /** Processing instruction. */
  private static final int S_PI = 6;
  /** DOCTYPE declaration. */
  private static final int S_DOCTYPE = 7;
  /** Explicit CDATA section. */
  private static final int S_CDATA = 8;
  /** Implied CDATA section following a script or style start tag. */
  private static final int S_CDATA_TAG = 9;

  /**
   * Receiver of the parse events. May be <code>null</code>; the parse
   * routines then skip the notification.
   */
  private IHTMLContentHandler m_content;

  /**
   * Character source of the document being parsed; <code>null</code> until a
   * source has been set and again after discard().
   */
  private HTMLInput m_in;

  /**
   * Single element instance that is re-parsed and handed out with every
   * start/end element event.
   */
  private final HTMLElementImpl m_element;

  /**
   * Current parser state, one of the S_* constants.
   */
  private int m_state;

  /**
   * Whether the start of the document has been signalled to the handler.
   */
  private boolean m_docStarted;

  /**
   * Name of the element whose end tag terminates the current implied CDATA
   * section ("script" or "style").
   */
  private String m_cdataTagName;

  /**
   * Debug switch, sampled from the logger when the reader is constructed.
   */
  private final boolean debug;

  /**
   * Creates a reader that has neither an input source nor a content handler.
   * The debug flag is read from the logger once, here, and is not refreshed
   * afterwards.
   */
  protected HTMLReaderImpl() {
    this.debug = log.beDebug();
    this.m_element = new HTMLElementImpl();
  }

  /**
   * Returns the handler that currently receives the parse events.
   *
   * @return the content handler, or <code>null</code> if none has been set
   */
  public IHTMLContentHandler getContentHandler() {
    return this.m_content;
  }

  /**
   * Sets the handler that is to receive the parse events.
   *
   * @param handler the new content handler; with <code>null</code> the
   *      reader still parses but does not report events
   */
  public void setContentHandler(IHTMLContentHandler handler) {
    this.m_content = handler;
  }

  /**
   * Uses the given stream as document source and resets the parser to the
   * start of a document.
   *
   * @param input stream to read the document from
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  public void setSource(InputStream input)
    throws HTMLException, IOException {
    this.m_in = new HTMLInput(input);
    this.prepareStart();
  }

  /**
   * Uses the given stream as document source and resets the parser to the
   * start of a document.
   *
   * @param input stream to read the document from
   * @param encoding encoding handed on to the HTMLInput created for the stream
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  public void setSource(InputStream input, String encoding)
    throws HTMLException, IOException {
    this.m_in = new HTMLInput(input, encoding);
    this.prepareStart();
  }

  /**
   * Uses the given reader as document source and resets the parser to the
   * start of a document.
   *
   * @param input reader to read the document from
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  public void setSource(Reader input)
    throws HTMLException, IOException {
    this.m_in = new HTMLInput(input);
    this.prepareStart();
  }

  /**
   * Returns the encoding reported by the current input.
   *
   * @return the encoding of the input, or <code>null</code> if no input is
   *      set
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  public String getEncoding()
    throws HTMLException, IOException {
    return (m_in == null) ? null : m_in.getEncoding();
  }

  /**
   * Parses the whole input, generating events until no more are available.
   *
   * @exception HTMLException if no input has been set, or raised while parsing
   * @exception IOException Exception raised in failure situation
   */
  public void parse()
    throws HTMLException, IOException {
    ensureInput();
    boolean more;
    do {
      more = internalParseNextEvent();
    } while (more);
  }

  /**
   * Parses the input up to the next event.
   *
   * @return whether further events may be requested
   * @exception HTMLException if no input has been set, or raised while parsing
   * @exception IOException Exception raised in failure situation
   */
  public boolean parseNextEvent()
    throws HTMLException, IOException {
    ensureInput();
    final boolean more = internalParseNextEvent();
    return more;
  }

  /**
   * Discards the current input, if there is one, and forgets it. Parsing
   * afterwards requires a new source to be set first.
   */
  public void discard() {
    if (m_in == null) {
      return;
    }
    m_in.discard();
    m_in = null;
  }

  // ----------------------- private ------------------------------------

  /**
   * Dispatches to the parse routine that belongs to the current parser state
   * and reports whether further events may be requested.
   * <p>
   * The parse routines switch the state and call back into this method, so a
   * single request for the next event may pass through here several times.
   * When the outcome is <code>false</code> and a document had been started,
   * the end of the document is signalled. The document-started flag is
   * cleared at that moment, so the end is signalled exactly once: neither
   * the nested invocations unwinding with <code>false</code> nor repeated
   * calls after the end of input raise it again.
   *
   * @return <code>true</code> if further events may be requested,
   *      <code>false</code> once parsing has come to an end
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private boolean internalParseNextEvent()
    throws HTMLException, IOException {
    boolean result = false;
    if (!m_in.eof()) {
      switch (m_state) {
        case S_START:
          result = parseDocument();
          break;
        case S_PRETAG:
          result = parsePreTag();
          break;
        case S_ENDTAG:
          result = parseEndTag();
          break;
        case S_TEXT:
          result = parseText();
          break;
        case S_TAG:
          result = parseStartTag();
          break;
        case S_PI:
          result = parsePI();
          break;
        case S_COMMENT:
          result = parseComment();
          break;
        case S_DOCTYPE:
          result = parseDOCTYPE();
          break;
        case S_CDATA:
          result = parseCDATA();
          break;
        case S_CDATA_TAG:
          result = parseCDATATag();
          break;
        default:
          // Should not happen; recover by treating the input as text.
          log.warningT("internalParseNextEvent(180)", "unrecognized parser state: " + m_state);
          m_state = S_TEXT;
          result = internalParseNextEvent();
          break;
      }
    }

    if (!result && m_docStarted) {
      // Clear the flag before notifying so that outer (recursive) invocations
      // and later calls do not signal the end of the document a second time.
      m_docStarted = false;
      if (m_content != null) {
        m_content.endDocument();
      }
    }
    return result;
  }

  /**
   * Handles the start of the document. Tries to read one character; if there
   * is none, the document is empty and nothing is reported. Otherwise the
   * character is pushed back, the parser switches to text mode and the start
   * of the document is signalled to the content handler.
   *
   * @return <code>false</code> for an empty input, <code>true</code> otherwise
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private boolean parseDocument()
    throws HTMLException, IOException {
    if (this.debug) {
      log.debugT("parseDocument(204)", "parseDocument()");
    }

    if (m_in.next() == -1) {
      return false;
    }
    m_in.pushBack();
    m_state = S_TEXT;

    if (m_content != null) {
      m_content.startDocument();
    }
    m_docStarted = true;
    return true;
  }

  /**
   * Decides what kind of markup follows a '<' and continues parsing in the
   * matching state: end tag for "&lt;/", processing instruction for "&lt;?",
   * comment, CDATA section or DOCTYPE for "&lt;!" followed by '-', '[' or
   * 'D', and start tag for everything else. Anything unrecognized is handled
   * as text. The leading '<' is always consumed here.
   *
   * @return result of parsing in the state that was selected, or
   *      <code>false</code> if there is no input left
   * @exception HTMLException if the input ends right after "&lt;" or "&lt;!"
   * @exception IOException Exception raised in failure situation
   */
  private boolean parsePreTag()
    throws HTMLException, IOException {
    final int first = m_in.next();
    if (first == -1) {
      return false;
    }
    if (first != '<') {
      if (this.debug) {
        log.debugT("parsePreTag(242)", "unexpected tag start: " + m_in.location());
      }
      m_state = S_TEXT;
      return internalParseNextEvent();
    }

    final int second = m_in.next();
    if (second == -1) {
      throw new HTMLException("unexpected end of input: " + m_in.location());
    }
    else if (second == '/') {
      m_state = S_ENDTAG;
    }
    else if (second == '?') {
      m_state = S_PI;
    }
    else if (second == '!') {
      final int third = m_in.next();
      if (third == -1) {
        throw new HTMLException("unexpected end of input: " + m_in.location());
      }
      final int declarationState =
        (third == '-') ? S_COMMENT
        : (third == '[') ? S_CDATA
        : (third == 'D') ? S_DOCTYPE
        : S_TEXT;
      if (declarationState != S_TEXT) {
        // the specialised parse routines match their own start sequence
        m_in.pushBack();
      }
      m_state = declarationState;
    }
    else {
      // first character of the element name belongs to the start tag
      m_in.pushBack();
      m_state = S_TAG;
    }

    return internalParseNextEvent();
  }

  /**
   * Reads text up to the next '<' or the end of the input. A '<' is pushed
   * back and the parser switches to tag detection. The characters collected
   * are reported as a text event; a buffer overrun simply ends the collecting
   * early.
   *
   * @return <code>true</code> if a text event was generated; otherwise the
   *      result of parsing on, or <code>false</code> at the end of the input
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private boolean parseText()
    throws HTMLException, IOException {
    if (this.debug) {
      log.debugT("parseText(301)", "parseText()");
    }

    int last = -1;
    try {
      do {
        last = m_in.next();
      } while (last != -1 && last != '<');

      if (last == '<') {
        m_in.pushBack();
        m_state = S_PRETAG;
      }
    }
    catch (HTMLBufferOverrunException ex) {
      //$JL-EXC$
      // buffer is full: report what has been collected so far
    }

    if (generateTextEvent()) {
      return true;
    }
    // nothing collected: continue with the markup unless the input has ended
    return last != -1 && internalParseNextEvent();
  }

  /**
   * Reads a processing instruction up to and including the closing "?&gt;",
   * or up to the end of the input, and reports the characters read as a text
   * event. A buffer overrun ends the reading early. The parser continues in
   * text mode afterwards.
   *
   * @return whether the input has more characters
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private boolean parsePI()
    throws HTMLException, IOException {
    if (this.debug) {
      log.debugT("parsePI(345)", "parsePI()");
    }

    try {
      boolean afterQuestionMark = false;
      for (int ch = m_in.next(); ch != -1; ch = m_in.next()) {
        if (afterQuestionMark && ch == '>') {
          break;
        }
        afterQuestionMark = (ch == '?');
      }
    }
    catch (HTMLBufferOverrunException e) {
      //$JL-EXC$
      // buffer is full: report what has been read so far
    }

    generateTextEvent();
    m_state = S_TEXT;
    return !m_in.eof();
  }

  /**
   * Reads a comment up to and including the closing "--&gt;", or up to the
   * end of the input, and reports the characters read as text event(s). When
   * the buffer overruns, the part read so far is reported and reading of the
   * comment goes on. If the input does not start with "--", it is handled as
   * text instead.
   *
   * @return whether the input has more characters, or the result of parsing
   *      on as text for a malformed comment start
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private boolean parseComment()
    throws HTMLException, IOException {
    if (this.debug) {
      log.debugT("parseComment(388)", "parseComment()");
    }

    if (m_in.match("--")) {
      // number of consecutive '-' seen directly before the current character
      int trailingDashes = 0;
      while (true) {
        int ch;
        try {
          ch = m_in.next();
        }
        catch (HTMLBufferOverrunException ex) {
          //$JL-EXC$
          // buffer is full: flush it and keep scanning for the comment end
          generateTextEvent();
          continue;
        }

        if (ch == -1 || (ch == '>' && trailingDashes > 1)) {
          break;
        }
        trailingDashes = (ch == '-') ? trailingDashes + 1 : 0;
      }

      generateTextEvent();
      m_state = S_TEXT;
      return !m_in.eof();
    }

    log.warningT("parseComment(392)", "malformed comment start, assuming text");
    m_state = S_TEXT;
    return internalParseNextEvent();
  }

  /**
   * Reads a DOCTYPE declaration up to the '>' that closes it and reports the
   * characters read as a text event. '<' and '>' are balanced, because
   * declarations nested in a DOCTYPE (ELEMENT and the like) use them as well.
   * Nothing else is known about the internal structure of a DOCTYPE. A buffer
   * overrun or the end of the input ends the reading early. If the input does
   * not start with "DOCTYPE", it is handled as text instead.
   *
   * @return whether the input has more characters, or the result of parsing
   *      on as text for a malformed DOCTYPE start
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private boolean parseDOCTYPE()
    throws HTMLException, IOException {
    if (this.debug) {
      log.debugT("parseDOCTYPE(442)", "parseDOCTYPE()");
    }
    if (!m_in.match("DOCTYPE")) {
      log.warningT("parseDOCTYPE(445)", "malformred DOCTYPE start, assuming text");
      m_state = S_TEXT;
      return internalParseNextEvent();
    }

    try {
      // The '<' that opened the declaration was consumed during tag
      // detection, so one bracket is already open when we get here.
      int open = 1;
      loop :
      while (true) {
        int c = m_in.next();
        switch (c) {
          case -1:
            break loop;
          case '<':
            // nested declaration starts; must not fall through to '>'
            ++open;
            break;
          case '>':
            --open;
            if (open <= 0) {
              // this '>' closes the DOCTYPE declaration itself
              break loop;
            }
            break;
          default:
            break;
        }
      }
    }
    catch (HTMLBufferOverrunException e) {
      //$JL-EXC$
      // buffer is full: report what has been read so far
    }

    generateTextEvent();
    m_state = S_TEXT;
    return !m_in.eof();
  }

  /**
   * Reads a CDATA section and reports the characters read as a text event.
   * A section opened with "[CDATA[" ends at "]]&gt;"; a section opened with a
   * bare "[" (seen in HTML generated by MS-Word) ends at "]&gt;". Reading
   * also stops at the end of the input, even if the section has not been
   * closed, and when the buffer overruns. The parser continues in text mode
   * afterwards. If the input does not start with "[", it is handled as text
   * instead.
   *
   * @return whether the input has more characters, or the result of parsing
   *      on as text for a malformed CDATA start
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private boolean parseCDATA()
    throws HTMLException, IOException {
    if (this.debug) {
      log.debugT("parseCDATA(491)", "parseCDATA()");
    }

    if (!m_in.match("[")) {
      if (this.debug) {
        log.debugT("parseCDATA(507)", "malformed CDATA start, assuming text: " + m_in.location());
      }
      m_state = S_TEXT;
      return internalParseNextEvent();
    }

    // Number of ']' required directly before the closing '>':
    // two for a real CDATA section, one for the MS-Word/SGML style "<![...]>".
    final int needed = m_in.match("CDATA[") ? 2 : 1;

    int brackets = 0;
    try {
      loop :
      while (true) {
        int c = m_in.next();
        switch (c) {
          case -1:
            // End of input inside an unterminated section. Stop here instead
            // of falling through to the '>' handling, which would reset the
            // counter and keep on reading past the end.
            break loop;
          case '>':
            if (brackets >= needed) {
              break loop;
            }
            brackets = 0;
            break;
          case ']':
            ++brackets;
            break;
          default:
            brackets = 0;
            break;
        }
      }
    }
    catch (HTMLBufferOverrunException ex) {
      //$JL-EXC$
      // buffer is full: report what has been read so far
    }

    generateTextEvent();
    m_state = S_TEXT;
    return !m_in.eof();
  }

  /**
   * Parse implied CDATA section until matching end element tag with name
   * m_cdataTagName (e.g. &lt;/script&gt;). The content before the end tag is
   * reported as a text event; the end tag itself is pushed back so that it is
   * parsed as a regular end tag afterwards.
   * <p>
   * If the buffer is exhausted while matching the end tag name, backtrack to
   * the position before the suspected end tag, generate a text event and keep
   * the internal state of the parser. Then the next event has a chance to
   * detect the end tag.
   *
   * @return whether the input has more characters if a text event was
   *      generated, otherwise the result of parsing on
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private boolean parseCDATATag()
    throws HTMLException, IOException {
    if (this.debug) {
      log.debugT("parseCDATATag(558)", "parseCDATATag(" + m_cdataTagName + ")");
    }

    boolean endTagFound = false;
    try {
      loop :
      while (true) {
        // presumably advances the input up to the next '<' - confirm in HTMLInput
        m_in.readUntil('<');
        int c = m_in.next();
        switch (c) {
          case '<':
            // buffer length right after the '<'; used to undo the look-ahead below
            int mark = m_in.getLength();
            try {
              c = m_in.next();
              if (c == '/') {
                // could be an end tag, but is it the correct one?
                if (m_in.matchIgnoreCase(m_cdataTagName)) {
                  // Push back the complete "</name" so that only the
                  // characters before the tag start remain for the text event
                  for (int i = 0; i < m_cdataTagName.length(); ++i) {
                    m_in.pushBack();
                  }
                  m_in.pushBack();// '/'
                  m_in.pushBack();// '<'
                  endTagFound = true;
                  break loop;
                }
              }
            }
            catch (HTMLBufferOverrunException ex) {
            //$JL-EXC$
              // our buffer is full, the whole style/script section does
              // not fit. We push back the characters read so far and generate
              // a TEXT event. That will send all data *before* the suspected end tag
              // (which we were just investigating) to the ContentHandler.
              // The buffer is then shortened and we can try to detect the end
              // tag again, starting at the '<' character which we had seen.
              //
              int off = m_in.getLength() - mark;
              for (int i = 0; i < off; ++i) {
                m_in.pushBack();
              }
              m_in.pushBack();// the '<' itself
              break loop;
            }
            // some other '<...' inside the section: keep scanning
            break;
          default:
            // no '<' was delivered (e.g. -1 at the end of the input): stop scanning
            break loop;
        }
      }
    }
    catch (HTMLBufferOverrunException ex) {
            //$JL-EXC$
      // ignore and generate text event
    }

    // Only leave the implied CDATA state once the end tag has been seen;
    // otherwise the next event continues scanning in this state.
    if (endTagFound) {
      m_state = S_TEXT;
    }

    if (!generateTextEvent()) {
      // nothing to report (e.g. empty section): go on with the next event
      return internalParseNextEvent();
    }
    return !m_in.eof();
  }

  /**
   * Parses a start tag and reports it as a start element event. The parsing
   * itself is delegated to m_element. After a non-empty "style" or "script"
   * element the parser switches to the implied CDATA state, in all other
   * cases back to text. Input that does not begin with a letter, or that
   * m_element fails to parse, is handled as text.
   *
   * @return whether the input has more characters, or the result of parsing
   *      on as text if no start tag could be parsed
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private boolean parseStartTag()
    throws HTMLException, IOException {
    if (this.debug) {
      log.debugT("parseStartTag(625)", "parseStartTag()");
    }

    final char first = (char)m_in.next();
    if (!Character.isLetter(first)) {
      if (this.debug) {
        log.debugT("parseStartTag(630)", "non-letter start of tag: " + m_in.location());
      }
      m_state = S_TEXT;
      return internalParseNextEvent();
    }

    try {
      // hand the first letter back, the element parser needs the full name
      m_in.pushBack();
      m_element.parse(m_in);

      if (this.debug) {
        log.debugT("parseStartTag(641)", "parseStartTag(): parsed " + m_element);
      }
      if (m_content != null) {
        m_content.startElement(m_element);
      }
      m_in.markAsRead();

      // name of the element that opens an implied CDATA section, if any
      String impliedCdataTag = null;
      if (!m_element.isEmpty()) {
        final int nameLength = m_element.getNameLength();
        if (nameLength == 5 && m_element.hasName("style")) {
          impliedCdataTag = "style";
        }
        else if (nameLength == 6 && m_element.hasName("script")) {
          impliedCdataTag = "script";
        }
      }

      if (impliedCdataTag == null) {
        m_state = S_TEXT;
      }
      else {
        m_state = S_CDATA_TAG;
        m_cdataTagName = impliedCdataTag;
      }
      return !m_in.eof();
    }
    catch (HTMLException e) {
      if (log.beInfo()) {
        log.infoT("parseStartTag(680)", "parsing tag: " + m_in.location() + " - " + com.sapportals.wcm.util.logging.LoggingFormatter.extractCallstack(e));
      }
      m_state = S_TEXT;
      return internalParseNextEvent();
    }
  }

  /**
   * Parses an end tag (the leading "&lt;/" has been consumed already) and
   * reports it as an end element event. The parsing itself is delegated to
   * m_element. Input that does not begin with a letter, or that m_element
   * fails to parse, is handled as text.
   *
   * @return whether the input has more characters, or the result of parsing
   *      on as text if no end tag could be parsed
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private boolean parseEndTag()
    throws HTMLException, IOException {
    if (this.debug) {
      log.debugT("parseEndTag(698)", "parseEndTag()");
    }

    final int first = m_in.next();
    if (Character.isLetter((char)first)) {
      try {
        // hand the first letter back, the element parser needs the full name
        m_in.pushBack();
        m_element.parse(m_in);
        if (this.debug) {
          log.debugT("parseEndTag(710)", "parseEndTag(): parsed " + m_element);
        }
        if (m_content != null) {
          m_content.endElement(m_element);
        }

        m_in.markAsRead();
        m_state = S_TEXT;
        return !m_in.eof();
      }
      catch (HTMLException e) {
        if (log.beInfo()) {
          log.infoT("parseEndTag(722)", "parsing end tag: " + m_in.location() + " - " + com.sapportals.wcm.util.logging.LoggingFormatter.extractCallstack(e));
        }
      }
    }

    // no end tag here, or it could not be parsed: go on as text
    m_state = S_TEXT;
    return internalParseNextEvent();
  }

  /**
   * Reports the characters currently held by m_in as a text event and marks
   * them as read. Does nothing if m_in holds no characters.
   *
   * @return <code>true</code> if there were characters to report,
   *      <code>false</code> otherwise
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private boolean generateTextEvent()
    throws HTMLException, IOException {
    final int pending = m_in.getLength();
    if (pending > 0) {
      if (m_content != null) {
        m_content.characters(m_in.getBuffer(), m_in.getOffset(), pending);
      }
      m_in.markAsRead();
      return true;
    }
    return false;
  }

  /**
   * Puts the parser back to the start of a document: no document started,
   * state S_START, and the input (if any) reset as well.
   */
  private void reset() {
    m_docStarted = false;
    m_state = S_START;
    if (m_in == null) {
      return;
    }
    m_in.reset();
  }

  /**
   * Verifies that an input source has been set.
   *
   * @throws HTMLException if the reader has no input
   */
  private void ensureInput()
    throws HTMLException {
    if (m_in != null) {
      return;
    }
    throw new HTMLException("reader missing input");
  }

  /**
   * Gets the reader ready for a new document: m_in is initialized with this
   * reader first (its chance to detect the encoding if needed), then the
   * parser state is reset.
   *
   * @exception HTMLException Exception raised in failure situation
   * @exception IOException Exception raised in failure situation
   */
  private void prepareStart()
    throws HTMLException, IOException {
    this.m_in.init(this);
    reset();
  }

}
