/*
 * Copyright (c) 2003 by SAP AG. All Rights Reserved.
 *
 * SAP, mySAP, mySAP.com and other SAP products and
 * services mentioned herein as well as their respective
 * logos are trademarks or registered trademarks of
 * SAP AG in Germany and in several other countries all
 * over the world. MarketSet and Enterprise Buyer are
 * jointly owned trademarks of SAP AG and Commerce One.
 * All other product and service names mentioned are
 * trademarks of their respective companies.
 *
 * @version $Id$
 */

package com.sapportals.wcm.util.html;

import com.sap.tc.logging.Location;
import com.sapportals.wcm.util.http.ContentType;

import java.io.*;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
 * Internal class providing input to a {@link IHTMLReader}. <p>
 *
 * HTMLInput handles two tasks:
 * <ul>
 *   <li> Detection of character encoding in the input document.</li>
 *   <li> Reading of characters with pushback up to buffer capacity.</li>
 * </ul>
 * See {@link HTMLReaderImpl} for more information about implementation details
 * and limitations. <p>
 *
 * Copyright (c) SAP AG 2001-2003
 *
 * @author stefan.eissing@greenbytes.de
 * @version $Id: HTMLInput.java,v 1.17 2003/03/20 18:41:30 jre Exp $
 */
final class HTMLInput {

  /**
   * Logging location used for all trace output of this class.
   */
  private final static Location log = Location.getLocation(com.sapportals.wcm.util.html.HTMLInput.class);

  /**
   * Capacity, in characters, of the internal buffer used for reading and
   * pushback.
   */
  public final static int INPUT_BUFFER_SIZE = 16 * 1024;

  /**
   * Maximum number of characters read while detecting the character encoding
   * (half of the buffer capacity).
   */
  public final static int INPUT_ENCODING_LOOKAHEAD_CHARS = INPUT_BUFFER_SIZE / 2;

  /**
   * Maximum number of octets that may be consumed while detecting the
   * character encoding. Used as the mark limit of the input stream and as the
   * size of the BufferedInputStream wrapped around streams without mark
   * support.
   */
  private final static int INPUT_ENCODING_LOOKAHEAD_BYTES = 2 * INPUT_BUFFER_SIZE;

  /**
   * Value of the read limit meaning that reading is not limited.
   */
  private final static int NO_LIMIT = -1;

  /**
   * Encoding used for the detection pass and assumed when no encoding could be
   * detected.
   */
  private final static String DEFAULT_ENCODING = "ISO-8859-1";

  /**
   * Key that introduces the character set inside the content attribute of a
   * meta tag.
   */
  private final static String CHARSET_KEY = "charset=";

  /**
   * Lower-case names of the encodings regarded as wide (16-bit) character
   * encodings. The set is unmodifiable.
   */
  private final static Set WCHAR_ENCODINGS;
  static {
    final String[] wideNames = {
      "utf-16",
      "utf-16be",
      "utf-16le",
      "unicode",
      "unicodebig",
      "unicodebigunmarked",
      "unicodelittle",
      "unicodelittleunmarked"
    };
    final Set names = new HashSet();
    for (int i = 0; i < wideNames.length; ++i) {
      names.add(wideNames[i]);
    }
    WCHAR_ENCODINGS = Collections.unmodifiableSet(names);
  }
  
  /**
   * Tells whether the given encoding name is one of the known wide (16-bit)
   * character encodings. The comparison ignores case.
   *
   * @param encoding name of the encoding, may be <code>null</code>
   * @return <code>true</code> if the name is listed in WCHAR_ENCODINGS,
   *      <code>false</code> otherwise or if encoding is <code>null</code>
   */
  private static boolean isWideCharacterEncoding(String encoding) {
    if (encoding == null) {
      return false;
    }
    // Lower-case with a fixed locale: with the JVM default locale the result
    // depends on the platform (a Turkish locale maps 'I' to a dotless 'i'),
    // so a name like "UNICODE" would not be found in the set.
    return WCHAR_ENCODINGS.contains(encoding.toLowerCase(java.util.Locale.ENGLISH));
  }

  // Reader the characters are taken from; null until the encoding is known
  // (see init()).
  private Reader m_input;
  // Underlying octet stream; null when constructed from a Reader or after the
  // stream has been closed.
  private InputStream m_is;
  // Name of the character encoding, null while unknown.
  private String m_encoding;
  // Set once the reader reported end of input or the read limit of the
  // encoding detection pass has been used up.
  private boolean m_eof;

  // Character buffer holding the input currently available for reading and
  // pushback.
  private final char[] m_buffer;
  // Buffer position where the currently read characters start; everything in
  // front of it may be discarded.
  private int m_mark;
  // Buffer position behind the last valid character.
  private int m_last;
  // Buffer position of the next character to read; runs past m_last once the
  // end-of-input marker has been returned by next().
  private int m_index;
  // Number of characters read since the last mark.
  private int m_len;
  // Number of characters that may still be read during encoding detection, or
  // NO_LIMIT for unlimited reading.
  private int m_readLimit = NO_LIMIT;

  // Number of characters taken from the reader since the last reset(); only
  // written, never read, in the visible code.
  private int m_charsRead;
  // Loop flag of the encoding detection; cleared by EncodingHandler to stop it.
  private boolean m_continueLooking;

  /**
   * Creates an input on a plain octet stream. The character encoding is not
   * known yet; it is detected when {@link #init(IHTMLReader)} is called.
   *
   * @param input stream to read the document from
   * @exception IOException declared for symmetry with the other constructors;
   *      not raised by the visible code
   */
  public HTMLInput(InputStream input)
    throws IOException {
    this.m_is = input;
    this.m_buffer = new char[INPUT_BUFFER_SIZE];
  }

  /**
   * Input from InputStream with known encoding.
   *
   * @param input TBD: Description of the incoming method parameter
   * @param encoding TBD: Description of the incoming method parameter
   * @exception IOException Exception raised in failure situation
   */
  public HTMLInput(InputStream input, String encoding)
    throws IOException {
    m_buffer = new char[INPUT_BUFFER_SIZE];
    m_is = input;
    m_encoding = ContentType.mapEncoding(encoding);
    m_input = new InputStreamReader(m_is, m_encoding);
  }

  /**
   * Input from Reader. Encoding detection not needed.
   *
   * @param input TBD: Description of the incoming method parameter
   * @exception IOException Exception raised in failure situation
   */
  public HTMLInput(Reader input)
    throws IOException {
    m_buffer = new char[INPUT_BUFFER_SIZE];
    m_is = null;
    m_input = input;
    if (m_input instanceof InputStreamReader) {
      m_encoding = ((InputStreamReader)m_input).getEncoding();
    }
  }

  /**
   * Prepares the input for reading. If no reader exists yet (input was created
   * from a plain stream), the character encoding is detected first.
   *
   * @param reader HTML reader used to parse the start of the document during
   *      encoding detection
   * @exception HTMLException if encoding detection cannot be started
   * @exception IOException if reading from or resetting the stream fails
   */
  public void init(IHTMLReader reader)
    throws HTMLException, IOException {
    if (m_input != null) {
      // reader already available, nothing to detect
      return;
    }
    findEncoding(reader);
  }

  /**
   * Returns the character encoding of the initialized input.
   *
   * @return name of the character encoding or <code>null</code> if unknown
   * @exception IOException declared, but not raised by the visible code
   */
  public String getEncoding()
    throws IOException {
    return this.m_encoding;
  }

  /**
   * Tells whether the end of the input has been reached.
   *
   * @return <code>true</code> if the end of input has been reached
   */
  public boolean eof() {
    return this.m_eof;
  }

  /**
   * Puts the buffer bookkeeping back into its initial state: empty buffer, no
   * read limit, end of input not reached. Only the fields of this object are
   * changed; the underlying reader or stream is not touched.
   */
  public void reset() {
    // buffer positions
    m_mark = 0;
    m_index = 0;
    m_last = 0;
    m_len = 0;
    // read state
    m_charsRead = 0;
    m_readLimit = NO_LIMIT;
    m_eof = false;
  }

  /**
   * Declares all characters up to the current read position as processed. They
   * may be dropped from the buffer when room is needed; the count of read
   * characters starts again at zero.
   */
  public void markAsRead() {
    m_len = 0;
    m_mark = m_index;
  }

  /**
   * Consumes characters up to, but not including, the given end character or
   * the end of input.
   *
   * @param end character that stops reading; it stays unread
   * @return <code>true</code> if at least one character has been consumed
   * @exception IOException if reading from the underlying input fails
   * @exception HTMLBufferOverrunException if the buffer is full before the end
   *      character was found
   */
  public boolean readUntil(char end)
    throws IOException, HTMLBufferOverrunException {
    int consumed = 0;
    for (int ch = next(); ch != -1 && ch != end; ch = next()) {
      consumed++;
    }
    // un-read the terminating character (or the end-of-input marker)
    pushBack();
    return consumed != 0;
  }

  /**
   * Consumes whitespace characters up to the first non-whitespace character or
   * the end of input.
   *
   * @return <code>true</code> if at least one character has been consumed
   * @exception IOException if reading from the underlying input fails
   * @exception HTMLBufferOverrunException if the buffer is full before reading
   *      stopped
   */
  public boolean readWS()
    throws IOException, HTMLBufferOverrunException {
    int consumed = 0;
    for (int ch = next(); ch != -1 && Character.isWhitespace((char)ch); ch = next()) {
      consumed++;
    }
    // un-read the terminating character (or the end-of-input marker)
    pushBack();
    return consumed != 0;
  }

  /**
   * Consumes characters up to the first whitespace character or the end of
   * input.
   *
   * @return <code>true</code> if at least one character has been consumed
   * @exception IOException if reading from the underlying input fails
   * @exception HTMLBufferOverrunException if the buffer is full before reading
   *      stopped
   */
  public boolean readNonWS()
    throws IOException, HTMLBufferOverrunException {
    int consumed = 0;
    while (true) {
      final int ch = next();
      if (ch == -1 || Character.isWhitespace((char)ch)) {
        break;
      }
      consumed++;
    }
    // un-read the terminating character (or the end-of-input marker)
    pushBack();
    return consumed != 0;
  }

  /**
   * Consumes characters up to the first whitespace character, the given end
   * character or the end of input, whichever comes first.
   *
   * @param end character that stops reading; it stays unread
   * @return <code>true</code> if at least one character has been consumed
   * @exception IOException if reading from the underlying input fails
   * @exception HTMLBufferOverrunException if the buffer is full before reading
   *      stopped
   */
  public boolean readNonWSUntil(char end)
    throws IOException, HTMLBufferOverrunException {
    int consumed = 0;
    while (true) {
      final int ch = next();
      if (ch == -1 || ch == end || Character.isWhitespace((char)ch)) {
        break;
      }
      consumed++;
    }
    // un-read the terminating character (or the end-of-input marker)
    pushBack();
    return consumed != 0;
  }

  /**
   * Consumes characters up to the first whitespace character, one of the two
   * given end characters or the end of input, whichever comes first.
   *
   * @param end1 first character that stops reading; it stays unread
   * @param end2 second character that stops reading; it stays unread
   * @return <code>true</code> if at least one character has been consumed
   * @exception IOException if reading from the underlying input fails
   * @exception HTMLBufferOverrunException if the buffer is full before reading
   *      stopped
   */
  public boolean readNonWSUntil(char end1, char end2)
    throws IOException, HTMLBufferOverrunException {
    int consumed = 0;
    while (true) {
      final int ch = next();
      if (ch == -1 || ch == end1 || ch == end2 || Character.isWhitespace((char)ch)) {
        break;
      }
      consumed++;
    }
    // un-read the terminating character (or the end-of-input marker)
    pushBack();
    return consumed != 0;
  }

  /**
   * Consumes characters up to the first whitespace character, any character
   * contained in endChars or the end of input, whichever comes first.
   *
   * @param endChars characters that stop reading; the one found stays unread
   * @return <code>true</code> if at least one character has been consumed
   * @exception IOException if reading from the underlying input fails
   * @exception HTMLBufferOverrunException if the buffer is full before reading
   *      stopped
   */
  public boolean readNonWSUntil(String endChars)
    throws IOException, HTMLBufferOverrunException {
    int consumed = 0;
    while (true) {
      final int ch = next();
      if (ch == -1 || endChars.indexOf(ch) >= 0 || Character.isWhitespace((char)ch)) {
        break;
      }
      consumed++;
    }
    // un-read the terminating character (or the end-of-input marker)
    pushBack();
    return consumed != 0;
  }

  /**
   * Tries to read exactly the given string (case-sensitive). On a mismatch all
   * characters read by this call are pushed back, so the read position is the
   * same as before the call.
   *
   * @param s string expected at the current read position
   * @return <code>true</code> if the string was read completely
   * @exception IOException if reading from the underlying input fails
   * @exception HTMLBufferOverrunException if the buffer is full before the
   *      comparison finished
   */
  public boolean match(String s)
    throws IOException, HTMLBufferOverrunException {
    final int expected = s.length();
    int pos = 0;
    while (pos < expected) {
      if (next() != s.charAt(pos)) {
        // undo the matched characters plus the one that did not match
        int undo = pos + 1;
        while (undo > 0) {
          pushBack();
          --undo;
        }
        return false;
      }
      ++pos;
    }
    return true;
  }

  /**
   * Tries to read the given string, ignoring case differences.
   * <p>
   * NOTE(review): unlike {@link #match(String)} this method does not push back
   * the characters it has read when the comparison fails; the read position
   * then stays behind the first non-matching character (or the end-of-input
   * marker). Callers are not visible here - confirm that this is intended.
   *
   * @param s string expected at the current read position
   * @return <code>true</code> if the string was read completely
   * @exception IOException if reading from the underlying input fails
   * @exception HTMLBufferOverrunException if the buffer is full before the
   *      comparison finished
   */
  public boolean matchIgnoreCase(String s)
    throws IOException, HTMLBufferOverrunException {
    final int expected = s.length();
    for (int pos = 0; pos < expected; ++pos) {
      final int ch = next();
      if (ch == -1) {
        return false;
      }
      final char wanted = s.charAt(pos);
      final char actual = (char)ch;
      if (wanted == actual) {
        continue;
      }
      if (Character.toUpperCase(wanted) == Character.toUpperCase(actual)) {
        continue;
      }
      if (Character.toLowerCase(wanted) == Character.toLowerCase(actual)) {
        continue;
      }
      return false;
    }
    return true;
  }

  /**
   * Reads the next character, refilling the buffer if necessary.
   *
   * @return the character, or -1 if the end of input has been reached
   * @exception IOException if reading from the underlying input fails
   * @exception HTMLBufferOverrunException if no character is available because
   *      the buffer is full and nothing could be discarded
   */
  public int next()
    throws IOException, HTMLBufferOverrunException {
    if (m_index >= m_last) {
      fillBuffer();
      if (m_index >= m_last) {
        // still nothing to read: either the buffer is full or input has ended
        if (!m_eof) {
          throw new HTMLBufferOverrunException("buffer is full");
        }
        // count the end-of-input marker so that it can be pushed back
        ++m_index;
        return -1;
      }
    }
    ++m_len;
    return m_buffer[m_index++];
  }

  /**
   * Un-reads the character read last. May be called once for every character
   * (or end-of-input marker) read since the last mark; further calls only log
   * a warning.
   */
  public void pushBack() {
    if (m_index <= m_mark) {
      log.warningT("pushBack(392)", "pushback beyond start of buffer");
      return;
    }
    // positions beyond m_last stand for end-of-input markers, which were never
    // counted as read characters
    if (m_index <= m_last) {
      m_len--;
    }
    m_index--;
  }

  /**
   * Copies the characters read since the last mark into the given array.
   *
   * @param buffer array receiving the characters
   * @param offset position in buffer where the first character is stored
   * @return number of characters copied, see {@link #getLength()}
   */
  public int copyTo(char[] buffer, int offset) {
    final int count = getLength();
    if (count <= 0) {
      return count;
    }
    System.arraycopy(m_buffer, m_mark, buffer, offset, count);
    return count;
  }

  /**
   * Returns the position in the internal buffer at which the currently read
   * characters start.
   *
   * @return buffer position of the first read character
   */
  public int getOffset() {
    return this.m_mark;
  }

  /**
   * Returns the number of characters read since the last mark. This is the
   * number of characters {@link #copyTo(char[], int)} copies.
   *
   * @return number of characters read
   */
  public int getLength() {
    return this.m_len;
  }

  /**
   * Returns one of the characters read since the last mark. The index is
   * independent of the offset in the internal buffer: the first read character
   * is always at index 0.
   *
   * @param index position of the character, relative to the last mark
   * @return character at index
   * @exception IndexOutOfBoundsException if index is negative or does not
   *      denote a character that has been read
   */
  public char charAt(int index) {
    // m_index runs past m_last once next() has returned the end-of-input
    // marker; only positions in front of m_last hold valid characters.
    final int limit = (m_index < m_last) ? m_index : m_last;
    final int pos = index + m_mark;
    // pos < m_mark also catches an int overflow of the addition above
    if (index < 0 || pos < m_mark || pos >= limit) {
      throw new IndexOutOfBoundsException("index: " + index);
    }
    return m_buffer[pos];
  }

  /**
   * Returns the internal character array used for reading. The array itself is
   * handed out, not a copy.
   *
   * @return the internal buffer
   */
  public char[] getBuffer() {
    return this.m_buffer;
  }

  /**
   * Builds a short excerpt of the buffer around the current read position for
   * debug output: up to ten characters in front of the position, the marker
   * "(==>)" and up to ten characters behind it. "EOF" is shown instead when no
   * buffered character follows the position.
   *
   * @return location information
   */
  public String location() {
    final int pos = Math.min(m_index, m_last);
    final int from = Math.max(pos - 10, 0);
    final int to = Math.min(pos + 10, m_last);

    final StringBuffer excerpt = new StringBuffer();
    excerpt.append(m_buffer, from, pos - from);
    excerpt.append("(==>)");
    if (pos != to) {
      excerpt.append(m_buffer, pos, to - pos);
    }
    else {
      excerpt.append("EOF");
    }
    return excerpt.toString();
  }

  /**
   * Disposes of all allocated resources: closes the reader and the input
   * stream, if present. Failures while closing are logged at debug level and
   * otherwise ignored. The references are dropped even if closing failed, so
   * that a broken reader or stream is not kept alive or closed again.
   */
  public void discard() {
    if (m_input != null) {
      try {
        m_input.close();
      }
      catch (IOException e) {
        // best effort only, nothing the caller could do about it
        log.debugT("discard(497)", "discard input reader" + " - " + com.sapportals.wcm.util.logging.LoggingFormatter.extractCallstack(e));
      }
      finally {
        m_input = null;
      }
    }

    if (m_is != null) {
      try {
        m_is.close();
      }
      catch (IOException e) {
        // best effort only, nothing the caller could do about it
        log.debugT("discard(508)", "discard inputstream" + " - " + com.sapportals.wcm.util.logging.LoggingFormatter.extractCallstack(e));
      }
      finally {
        m_is = null;
      }
    }
  }

  // ----------------------- private -----------------------------------------

  /**
   * Removes the characters in front of the mark from the internal buffer by
   * moving the remaining ones to the start, making room for more input. Does
   * nothing if the mark is already at the start of the buffer.
   */
  private void discardReadChars() {
    if (m_mark <= 0) {
      return;
    }
    if (m_mark >= m_last) {
      // nothing worth keeping, start over with an empty buffer
      m_mark = 0;
      m_last = 0;
      m_index = 0;
      m_len = 0;
      return;
    }
    final int shift = m_mark;
    final int remaining = m_last - shift;
    System.arraycopy(m_buffer, shift, m_buffer, 0, remaining);
    m_last = remaining;
    m_index -= shift;
    m_mark = 0;
  }

  /**
   * Fills the buffer with more characters from the reader, if all buffered
   * characters have been consumed and the end of input has not been reached.
   * <p>
   * Without a read limit, characters in front of the mark are discarded when
   * the buffer is full; if nothing can be discarded the method returns without
   * reading and the caller has to deal with the full buffer. With a read limit
   * (encoding detection) at most the remaining limit is read; once the limit is
   * used up, the input is treated as ended but the reader is left open.
   *
   * @exception IOException if reading from the reader fails
   */
  private void fillBuffer()
    throws IOException {
    if (m_eof) {
      return;
    }

    if (m_index >= m_last) {
      // all buffered characters have been consumed; "read" stays -1 when
      // nothing is read at all, which is handled like end of input below
      int read = -1;
      if (m_readLimit == NO_LIMIT) {
        do {
          int max = m_buffer.length - m_last;
          if (max <= 0) {
            // buffer is full, try to throw away chars marked as read.
            // If that does not help, give up. Caller knows how to handle
            // this case
            //
            if (m_mark <= 0) {
              return;
            }
            discardReadChars();
            max = m_buffer.length - m_last;
          }

          read = m_input.read(m_buffer, m_last, max);
        } while (read == 0);

        if (log.beDebug()) {
          StringBuffer sb = new StringBuffer(m_buffer.length + 100);
          if (read > 0)  {
            sb.append("input buffer is:\n").append(m_buffer, 0, m_last+read);
          }
          else {
            sb.append("end of input reached, current buffer offset ").append(m_last);
            // only the part in front of m_last holds valid characters; the
            // rest of the array is unused or stale
            sb.append(", buffer is:\n").append(m_buffer, 0, m_last);
          }
          log.debugT("fillBuffer(569)", sb.toString());
        }
      }
      else if (m_readLimit > 0) {
        do {
          read = m_input.read(m_buffer, m_last, m_readLimit);
        } while (read == 0);

        if (read > 0) {
          m_readLimit -= read;
          if (m_readLimit < 0) {
            m_readLimit = 0;
          }
        }
      }

      if (read == -1) {
        m_eof = true;
        // during limited reading the stream is still needed afterwards, so
        // the input is only closed when reading without limit
        if (m_readLimit == NO_LIMIT) {
          try {
            m_input.close();
            if (m_is != null) {
              m_is.close();
              m_is = null;
            }
          }
          catch (IOException ex) {
            log.warningT("fillBuffer(596)", "closing input" + " - " + com.sapportals.wcm.util.logging.LoggingFormatter.extractCallstack(ex));
          }
        }
      }
      else {
        m_charsRead += read;
        m_last += read;
      }
    }
  }

  /**
   * Detects the character encoding of the input stream. This is how it works:
   * <ul>
   *   <li> Make sure that the InputStream we read from supports mark. If not,
   *   put a BufferedInputStream on top of it which does.</li>
   *   <li> Reset the buffer state, read the stream with DEFAULT_ENCODING, mark
   *   the stream and limit reading to INPUT_ENCODING_LOOKAHEAD_CHARS.</li>
   *   <li> Save the old content handler of the IHTMLReader and install our own
   *   content handler, which just does encoding detection.</li>
   *   <li> Tell the reader to parse events until the handler stops the search
   *   (it has seen the content type meta tag or a &lt;body&gt; tag) or the
   *   reader has no more events, e.g. because the lookahead is used up.</li>
   *   <li> If no encoding has been detected, assume DEFAULT_ENCODING.</li>
   *   <li> Restore the old content handler in the reader and set up the
   *   internal Reader again with the resulting encoding, starting at the marked
   *   stream position.</li>
   * </ul>
   * An HTMLException raised while parsing is logged at debug level and ignored.
   *
   * @param reader HTML reader used to parse the start of the document
   * @exception HTMLException if there is no input stream to read from or the
   *      encoding has already been defined
   * @exception IOException if resetting the stream to the marked position
   *      fails
   */
  private void findEncoding(IHTMLReader reader)
    throws HTMLException, IOException {
    if (m_is == null) {
      throw new HTMLException("no inputstream defined to read from");
    }
    if (m_encoding != null) {
      throw new HTMLException("encoding already defined: " + m_encoding);
    }

    if (!m_is.markSupported()) {
      if (log.beDebug()) {
        log.debugT("detect encoding, mark not supported, wrapping input");
      }
      m_is = new BufferedInputStream(m_is, INPUT_ENCODING_LOOKAHEAD_BYTES);
    }

    // first pass: read with the default encoding, limited to the lookahead
    this.reset();
    m_input = new InputStreamReader(m_is, DEFAULT_ENCODING);
    m_is.mark(INPUT_ENCODING_LOOKAHEAD_BYTES);
    m_readLimit = INPUT_ENCODING_LOOKAHEAD_CHARS;

    IHTMLContentHandler oldHandler = reader.getContentHandler();
    try {
      IHTMLContentHandler myHandler = new EncodingHandler();
      reader.setContentHandler(myHandler);

      // the handler clears m_continueLooking as soon as the search is over
      m_continueLooking = true;
      while (m_continueLooking && reader.parseNextEvent()) {
        // loop
      }
    }
    catch (HTMLException ex) {
      if (log.beDebug()) {
        log.debugT("findEncoding(651)", "during encoding search" + " - " + com.sapportals.wcm.util.logging.LoggingFormatter.extractCallstack(ex));
      }
      // ignored
    }
    finally {
      // always leave the reader and the input in a usable state
      reader.setContentHandler(oldHandler);
      if (m_encoding == null) {
        m_encoding = DEFAULT_ENCODING;
      }
      resetInput();
    }
  }

  /**
   * Rewinds the input stream to the marked position, resets the buffer state
   * and creates a new reader with the current encoding. If that encoding is
   * not supported, a warning is logged and a reader with the platform default
   * encoding is used instead; the encoding reported by getEncoding() is then
   * the one of that reader, not the unsupported name.
   *
   * @exception IOException if the stream cannot be reset to its mark
   */
  private void resetInput()
    throws IOException {
    try {
      m_is.reset();
      this.reset();
      m_input = new InputStreamReader(m_is, m_encoding);
    }
    catch (UnsupportedEncodingException ex) {
      log.warningT("resetInput(672)", "html encoding not supported: " + m_encoding + " - " + com.sapportals.wcm.util.logging.LoggingFormatter.extractCallstack(ex));
      InputStreamReader fallback = new InputStreamReader(m_is);
      m_input = fallback;
      // keep the reported encoding in line with the one actually decoding
      m_encoding = fallback.getEncoding();
    }
  }

  /**
   * Content handler installed while the character encoding is detected. It
   * ignores everything except element starts: a meta element with
   * http-equiv="Content-Type" provides the encoding and ends the search, a body
   * element ends the search without a result.
   */
  private class EncodingHandler implements IHTMLContentHandler {

    public EncodingHandler() { }

    /** Ignored, character data does not matter for encoding detection. */
    public void characters(char[] ch, int start, int length)
      throws HTMLException { }

    /** Ignored. */
    public void endDocument()
      throws HTMLException { }

    /** Ignored. */
    public void endElement(IHTMLElement element)
      throws HTMLException { }

    /** Ignored. */
    public void startDocument()
      throws HTMLException { }

    /**
     * Looks for a meta element with http-equiv="Content-Type" and takes the
     * character set from its content attribute. Wide character encodings are
     * rejected, see the comment in the method body.
     *
     * @param element the element that has been started
     * @exception HTMLException declared by the interface; not raised by the
     *      visible code
     */
    public void startElement(IHTMLElementStart element)
      throws HTMLException {
      // "meta" and "body" both have four characters, anything else is of no
      // interest
      if (element.getNameLength() != 4) {
        return;
      }

      if (!element.hasName("meta")) {
        if (element.hasName("body")) {
          m_continueLooking = false;
        }
        return;
      }

      int index = element.getIndexOf("http-equiv");
      // constant first: whether getValueOf() can return null for an attribute
      // without value is not visible here, so do not dereference the result
      if (index < 0 || !"Content-Type".equalsIgnoreCase(element.getValueOf(index))) {
        return;
      }

      // We found the content-type
      //
      m_continueLooking = false;
      index = element.getIndexOf("content");
      if (index < 0) {
        return;
      }

      String encoding = extractCharset(element.getValueOf(index));
      if (encoding == null) {
        return;
      }

      m_encoding = ContentType.mapEncoding(encoding);
      // stefan.eissing@greenbytes.de: well we got an encoding here, but can
      // we really believe our luck? There are HTML pages that lie about it.
      // One thing is for sure: if this page tells us that its encoding is
      // a 16 (or more?) bit character set, it has to be cheating. How could
      // we have read the page so far in our 8-bit DEFAULT_ENCODING if the
      // characters use more than 8 bit? Seems impossible.
      // We therefore reject 16-bit encodings (but not multibyte characters
      // like UTF-8 of course)...
      if (log.beDebug()) {
        log.debugT("startElement(744)", "detected encoding: " + m_encoding);
      }
      if (isWideCharacterEncoding(m_encoding)) {
        if (log.beDebug()) {
          log.debugT("startElement(744)", "rejected encoding: " + m_encoding);
        }
        m_encoding = null;
      }
    }

    /**
     * Extracts the value of the charset parameter from a content type string
     * such as <code>text/html; charset=utf-8</code>. The parameter name is
     * matched ignoring case; the value may be quoted and ends at whitespace,
     * a semicolon, a quote or the end of the string.
     *
     * @param mime value of the content attribute, may be <code>null</code>
     * @return the character set name, or <code>null</code> if there is none or
     *      it is shorter than two characters
     */
    private String extractCharset(String mime) {
      if (mime == null) {
        return null;
      }

      // parameter names are case-insensitive, so "Charset=" and "CHARSET="
      // have to be found as well
      final int keyLength = CHARSET_KEY.length();
      final int length = mime.length();
      int start = -1;
      for (int i = 0; i <= length - keyLength; ++i) {
        if (mime.regionMatches(true, i, CHARSET_KEY, 0, keyLength)) {
          start = i + keyLength;
          break;
        }
      }
      if (start < 0) {
        return null;
      }

      // skip an opening quote of a quoted parameter value
      if (start < length && isQuote(mime.charAt(start))) {
        ++start;
      }

      int end = start;
      while (end < length) {
        char c = mime.charAt(end);
        if (Character.isWhitespace(c) || c == ';' || isQuote(c)) {
          break;
        }
        ++end;
      }

      // as before, values of less than two characters are not regarded as an
      // encoding name
      if (end - start < 2) {
        return null;
      }
      return mime.substring(start, end);
    }

    /**
     * @param c character to test
     * @return <code>true</code> if c is a single or double quote
     */
    private boolean isQuote(char c) {
      return c == '"' || c == '\'';
    }
  }

}
