/*
 * Copyright (c) 2003 by SAP AG. All Rights Reserved.
 *
 * SAP, mySAP, mySAP.com and other SAP products and
 * services mentioned herein as well as their respective
 * logos are trademarks or registered trademarks of
 * SAP AG in Germany and in several other countries all
 * over the world. MarketSet and Enterprise Buyer are
 * jointly owned trademarks of SAP AG and Commerce One.
 * All other product and service names mentioned are
 * trademarks of their respective companies.
 *
 * @version $Id$
 */

package com.sapportals.wcm.util.html;

import com.sapportals.wcm.WcmObject;

import java.io.*;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
 * HtmlTokenizer <p>
 *
 * Splits an HTML input stream into a sequence of text, tag and comment
 * tokens. The tokenizer is a small state machine that is deliberately
 * lenient: it does not validate the markup, it merely cuts it into pieces.
 * <p>
 *
 * When a tokenizer is created, the beginning of the input is scanned for a
 * <code>meta</code> tag carrying a <code>charset=</code> declaration. If one
 * is found, the stream is rewound and decoded with that encoding; otherwise
 * the default encoding (ISO-8859-1) is used. <p>
 *
 * Copyright (c) SAP AG 2001-2003
 *
 * @author roland.preussmann@sapportals.com
 * @version $Id: HtmlTokenizer.java,v 1.12 2003/03/20 18:41:30 jre Exp $
 */
public class HtmlTokenizer extends WcmObject {

  /**
   * end of file
   */
  public final static int TOKEN_EOF = -1;
  /**
   * text token
   */
  public final static int TOKEN_TEXT = 1;
  /**
   * tag token
   */
  public final static int TOKEN_TAG = 2;
  /**
   * comment token
   */
  public final static int TOKEN_COMMENT = 3;

  private final static com.sap.tc.logging.Location log = com.sap.tc.logging.Location.getLocation(com.sapportals.wcm.util.html.HtmlTokenizer.class);

  /** encoding used as long as the page does not declare one */
  private final static String DEFAULT_ENCODING = "ISO-8859-1";
  /** marker that precedes the encoding name inside a meta tag */
  private final static String CHARSET = "charset=";

  // states of the tokenizer state machine
  private final static int STATE_EOF = 0;// input exhausted
  private final static int STATE_TEXT = 1;// outside of any tag
  private final static int STATE_TAG = 2;// after '<', before the closing '>'
  private final static int STATE_TAG_QUOTE = 3;// inside a quoted value within a tag
  private final static int STATE_COMMENT = 4;// after the comment opener, before its end

  /** size of the local character buffer (in chars) */
  private final static int INPUT_BUFFER_SIZE = 4 * 1024;

  /**
   * Maps (lower case) charset aliases found in HTML pages to the encoding
   * name that is handed to the reader.
   */
  private final static Map g_encodings;

  static {
    Map encodings = new HashMap(31);

    encodings.put("utf-8", "UTF-8");
    encodings.put("unicode-1-1-utf-8", "UTF-8");
    encodings.put("unicode-2-0-utf-8", "UTF-8");
    encodings.put("x-unicode-2-0-utf-8", "UTF-8");

    encodings.put("x-sjis", "x-sjis");
    encodings.put("ms_kanji", "x-sjis");
    encodings.put("csshiftjis", "x-sjis");
    encodings.put("shift_jis", "x-sjis");
    encodings.put("cswindow31j", "x-sjis");
    encodings.put("x-ms-cp932", "x-sjis");

    g_encodings = Collections.unmodifiableMap(encodings);
  }

  private int m_state;// current STATE_ value
  private int m_type;// TOKEN_ type of the last token returned by next()
  private int m_comment;// number of consecutive '-' seen last inside a comment
  private int m_tagquote;// quote character that opened the current quoted value
  private final StringBuffer m_token;// raw content of the current token

  private String m_encoding;

  private Reader m_reader;
  private InputStream m_input;

  private final char[] m_buffer;
  private int m_bufferIndex;// index of the next unread char in m_buffer
  private int m_bufferSize;// end (exclusive) of the valid data in m_buffer
  private int m_available;// result of the last read; negative at end of input
  private char m_pushback;// single pushed back char, 0 if there is none
  private boolean m_limitLoading;// true while looking ahead for the encoding

  /**
   * Create a new HtmlTokenizer <pre>
   * InputStream is = ...; // input stream delivering the HTML
   * HtmlTokenizer htmlTokenizer = new HtmlTokenizer(is);
   * while (htmlTokenizer.next() != HtmlTokenizer.TOKEN_EOF) {
   *   int type = htmlTokenizer.getTokenType();
   *   if (type == HtmlTokenizer.TOKEN_TAG) {
   *     System.out.println("tag: " + htmlTokenizer.getToken());
   *   }
   *   else if (type == HtmlTokenizer.TOKEN_TEXT) {
   *     System.out.println("text: " + htmlTokenizer.getToken());
   *   }
   *   else if (type == HtmlTokenizer.TOKEN_COMMENT) {
   *     System.out.println("comment: " + htmlTokenizer.getToken());
   *   }
   * }
   * </pre>
   * Problems that occur while the reader is set up (including the encoding
   * detection) are logged as warnings and not thrown.
   *
   * @param in input stream
   */
  public HtmlTokenizer(InputStream in) {

    m_input = in;
    m_buffer = new char[INPUT_BUFFER_SIZE];
    m_token = new StringBuffer();
    m_encoding = DEFAULT_ENCODING;

    reset();

    try {
      setupReader();
    }
    catch (Exception ex) {
      log.warningT("HtmlTokenizer(137)", "setting up reader" + " - " + com.sapportals.wcm.util.logging.LoggingFormatter.extractCallstack(ex));
    }

    log.debugT("HtmlTokenizer(140)", "new HtmlTokenizer");
  }

  /**
   * @return the encoding of the HTML page: the (mapped) name declared in a
   *      meta tag of the page, or ISO-8859-1 if none was detected
   */
  public String getEncoding() {
    return m_encoding;
  }

  /**
   * @return the last token type. See <code>TOKEN_</code> defines
   */
  public int getTokenType() {
    return m_type;
  }

  /**
   * this method returns the current token as a String. Tags are wrapped in
   * "&lt;" and "&gt;" again, comments in the comment delimiters; text is
   * returned as is. For any other token type the empty string is returned.
   *
   * @return token
   */
  public String getToken() {
    if (m_type == TOKEN_TEXT) {
      return m_token.toString();
    }
    else if (m_type == TOKEN_TAG) {
      return "<" + m_token.toString() + ">";
    }
    else if (m_type == TOKEN_COMMENT) {
      return "<!--" + m_token.toString() + "-->";
    }
    else {
      return "";
    }
  }

  /**
   * this method returns raw content of last token, e.g. a tag without "&lt;"
   * and "&gt;".
   *
   * @return tokenContent
   */
  public String getTokenContent() {
    return m_token.toString();
  }

  /**
   * Parse the input stream and returns the type of the next token. type is one
   * of the <code>TOKEN_</code> defines. The content of the token is available
   * through {@link #getToken()} and {@link #getTokenContent()} afterwards.
   * Once the end of the input has been reached, every further call returns
   * <code>TOKEN_EOF</code>. A tag or comment that is still open at the end of
   * the input is not reported as a token.
   *
   * @return type of the token that has been read
   * @exception IOException if reading from the input fails
   */
  public int next()
    throws IOException {
    if (m_state == STATE_EOF) {
      m_type = TOKEN_EOF;
      return m_type;
    }

    m_token.setLength(0);

    while (true) {
      char c;

      try {
        if (eof()) {
          // only pending text is delivered at the end of the input
          if (m_state == STATE_TEXT && m_token.length() > 0) {
            m_type = TOKEN_TEXT;
          }
          else {
            m_type = TOKEN_EOF;
          }

          m_state = STATE_EOF;
          return m_type;
        }
      }
      catch (IOException ex) {
        log.errorT("next(220)", "exception testing eof, type=" + m_type + " state=" + m_state + " token=" + m_token);
        throw ex;
      }

      // get next character
      c = getChar();

      switch (m_state) {
        case STATE_TEXT:
          if (c == '<') {
            m_state = STATE_TAG;
            // deliver the text collected so far; the tag follows with the
            // next call
            if (m_token.length() > 0) {
              return m_type = TOKEN_TEXT;
            }
          }
          else {
            m_token.append(c);
          }
          break;
        case STATE_TAG:
        {
          int buflen = m_token.length();
          if (c == '>') {
            m_state = STATE_TEXT;
            return m_type = TOKEN_TAG;
          }
          else if (c == '<' && buflen == 0) {
            // handle '<<', e.g. in a  <pre> section: treat both as text
            m_token.append("<<");
            m_state = STATE_TEXT;
          }
          else if (c == '-' && buflen == 2 && m_token.charAt(1) == '-' && m_token.charAt(0) == '!') {
            // handle <!-- : the opener is not part of the comment content
            m_token.setLength(0);
            m_state = STATE_COMMENT;
          }
          else if (c == '\'' || c == '"') {
            // handle quotes inside tag
            m_tagquote = c;// remember the quote character!
            m_token.append(c);
            m_state = STATE_TAG_QUOTE;
          }
          else {
            m_token.append(c);
          }
        }
          break;
        case STATE_TAG_QUOTE:
        {
          // the only way out of this state is to close the quote
          // special case: some people forget to end quote in a tag
          if (c == '>') {
            // let STATE_TAG see the '>' again so that it ends the tag
            m_pushback = c;
            m_state = STATE_TAG;
          }
          else {
            m_token.append(c);
            if (c == m_tagquote) {
              m_state = STATE_TAG;
            }
          }
        }
          break;
        case STATE_COMMENT:
        {
          if (c == '>' && m_comment >= 2) {
            // end of comment: drop the two '-' that were already appended
            m_token.setLength(m_token.length() - 2);
            m_comment = 0;
            m_state = STATE_TEXT;
            return m_type = TOKEN_COMMENT;
          }
          else if (c == '-') {
            m_comment++;
          }
          else {
            m_comment = 0;
          }
          m_token.append(c);
        }
          break;
      }// switch
    }// while
  }

  /**
   * This method writes the inputstream from the current position to the
   * Writer <b>without</b> any further parsing. This covers a pending pushed
   * back character, the unread rest of the local buffer and everything the
   * reader still delivers. Afterwards the input is exhausted.
   *
   * @param pw writer that receives the remaining content
   * @exception IOException if reading the input or writing to <code>pw</code>
   *      fails
   */
  public void writeToStream(Writer pw)
    throws IOException {
    if (m_pushback != 0) {
      pw.write(m_pushback);
      // consumed now; must not be delivered a second time
      m_pushback = 0;
    }

    // m_available is negative once the end of the input has been seen; the
    // buffer holds no unread data then and the length must not go negative
    int remaining = m_bufferSize - m_bufferIndex;
    if (m_available > 0 && remaining > 0) {
      pw.write(m_buffer, m_bufferIndex, remaining);
    }

    while ((m_available = m_reader.read(m_buffer, 0, m_buffer.length)) != -1) {
      pw.write(m_buffer, 0, m_available);
    }

    // everything has been handed out, leave the buffer in its end-of-input
    // state
    m_bufferIndex = 0;
    m_bufferSize = m_available;
  }

  // --------------------------- private -----------------------------------

  /**
   * Puts the state machine and the buffer bookkeeping back to their initial
   * values. Neither the reader nor the input stream is touched.
   */
  private void reset() {
    m_state = STATE_TEXT;
    m_type = TOKEN_TEXT;
    m_bufferIndex = 0;
    m_bufferSize = 0;
    m_available = 0;
    m_pushback = 0;
    m_comment = 0;
    m_limitLoading = false;
  }

  /**
   * Creates a new reader on the input stream using the current encoding. If
   * that encoding is not supported, a warning is logged and a reader with the
   * platform default encoding is used instead.
   */
  private void resetReader() {
    try {
      m_reader = new InputStreamReader(m_input, getEncoding());
    }
    catch (UnsupportedEncodingException ex) {
      log.warningT("resetReader(342)", "html encoding not supported: " + getEncoding() + " - " + com.sapportals.wcm.util.logging.LoggingFormatter.extractCallstack(ex));
      m_reader = new InputStreamReader(m_input);
    }
  }

  /**
   * @return <code>true</code> if there is neither a pushed back character nor
   *      any further input
   * @exception IOException if reading from the input fails
   */
  private boolean eof()
    throws IOException {
    if (m_pushback != 0) {
      return false;
    }
    loadBuffer();
    return m_available < 0;
  }

  /**
   * @return the pushed back character if there is one, otherwise the next
   *      character of the buffer; 0 if no input is available
   * @exception IOException if reading from the input fails
   */
  private char getChar()
    throws IOException {
    if (m_pushback != 0) {
      char c = m_pushback;
      m_pushback = 0;
      return c;
    }
    loadBuffer();
    return (m_available > 0) ? m_buffer[m_bufferIndex++] : 0;
  }

  /**
   * Fills the local buffer from the reader if all of its content has been
   * consumed. At the end of the input <code>m_available</code> is negative
   * afterwards.
   *
   * @exception IOException if reading from the input fails
   */
  private void loadBuffer()
    throws IOException {
    if (m_available == 0 || m_bufferIndex >= m_bufferSize) {
      if (m_limitLoading) {
        // we are limit to just loading this buffer once and never throw
        // away anything. This is look-ahead mode: new data is appended
        // behind the data already read and a full buffer counts as end of
        // input
        m_available = 0;
        if (m_bufferIndex < m_buffer.length) {
          // retry as long as the reader delivers nothing
          while ((m_available = m_reader.read(m_buffer, m_bufferIndex,
            m_buffer.length - m_bufferIndex)) == 0) {
          }
          m_bufferSize = m_bufferIndex + m_available;
        }
        else {
          m_available = -1;
        }
      }
      else {
        // normal mode: the buffer is overwritten from its start
        m_bufferIndex = 0;
        // retry as long as the reader delivers nothing
        while ((m_available = m_reader.read(m_buffer, 0, m_buffer.length)) == 0) {
        }
        m_bufferSize = m_available;
      }
    }
  }

  /**
   * Detects the encoding of the page and sets up the reader accordingly. The
   * start of the input is tokenized in look-ahead mode until a meta tag with
   * a charset declaration, the end of the head, the body or the end of the
   * look-ahead buffer is found. Afterwards the tokenizer state is reset, the
   * input stream is rewound to its start and a reader with the detected
   * encoding is created.
   *
   * @exception IOException if the input stream cannot be rewound
   */
  private void setupReader()
    throws IOException {

    // implementation note:
    // BufferedInputStream is used to enable the mark/reset methods in
    // findEncoding()! The local buffer (m_buffer) is used to speed
    // up the read operations. Is more than two times faster as reading
    // without the local buffer!!
    //
    if (!m_input.markSupported()) {
      m_input = new BufferedInputStream(m_input, 2 * INPUT_BUFFER_SIZE);
    }

    // sei: this is asking for trouble, since there is no
    // check that only this many bytes are read
    //
    m_input.mark(INPUT_BUFFER_SIZE);
    resetReader();

    m_limitLoading = true;
    try {

      boolean done = false;
      while (!done) {
        next();
        int type = getTokenType();
        switch (type) {
          case TOKEN_TAG:
            // lower case with a fixed locale: with the default locale the
            // result depends on the platform (e.g. 'I' under a turkish
            // locale) and charset names would not be recognized
            done = detectEncoding(getTokenContent().toLowerCase(java.util.Locale.ENGLISH));
            break;
          case TOKEN_EOF:
            done = true;
            break;
          default:
            break;
        }
      }
    }
    catch (IOException ex) {
      log.warningT("setupReader(468)", "error finding encoding information" + " - " + com.sapportals.wcm.util.logging.LoggingFormatter.extractCallstack(ex));
    }
    finally {

      reset();
      m_input.reset();
      resetReader();
    }
  }

  /**
   * Inspects one tag for encoding information and takes over the encoding
   * declared by a meta tag, if any.
   *
   * @param tag lower case content of the tag
   * @return <code>true</code> if the search for the encoding is finished,
   *      i.e. a charset declaration, the end of the head or the body has been
   *      found
   */
  private boolean detectEncoding(String tag) {
    if (log.beDebug()) {
      log.debugT("setupReader(424)", "find encoding tag: " + tag);
    }

    // is it a meta tag ?
    if (tag.indexOf("meta") >= 0) {
      int idx = tag.indexOf(CHARSET);
      if (idx <= 0) {
        // meta tag without charset declaration, keep on searching
        return false;
      }

      String encoding = extractCharsetValue(tag, idx + CHARSET.length());
      if (encoding != null) {
        String mappedEncoding = (String)g_encodings.get(encoding);
        if (mappedEncoding != null) {
          encoding = mappedEncoding;
        }
        if (!encoding.equals(m_encoding)) {
          m_encoding = encoding;
          log.infoT("setupReader(448)", "detected encoding: " + m_encoding);
        }
      }
      // an empty declaration leaves the default encoding untouched
      return true;
    }

    // no encoding defined in HTML header, use default
    return tag.indexOf("/head") >= 0 || tag.indexOf("body") >= 0;
  }

  /**
   * Cuts the encoding name out of a tag. Handles both a declaration inside
   * a quoted content attribute (<code>content="text/html; charset=x"</code>)
   * and a value with quotes of its own (<code>charset="x"</code>).
   *
   * @param tag content of the tag
   * @param start index of the first character behind <code>charset=</code>
   * @return the encoding name or <code>null</code> if the value is empty
   */
  private static String extractCharsetValue(String tag, int start) {
    int len = tag.length();

    // skip white space and an opening quote in front of the value
    while (start < len && Character.isWhitespace(tag.charAt(start))) {
      start++;
    }
    if (start < len && (tag.charAt(start) == '"' || tag.charAt(start) == '\'')) {
      start++;
    }

    // find end of charset
    int end = start;
    while (end < len) {
      char ch = tag.charAt(end);
      if (ch == '>' || ch == '"' || ch == '\'' || ch == ';' || Character.isWhitespace(ch)) {
        break;
      }
      end++;
    }

    return (end > start) ? tag.substring(start, end) : null;
  }
}
