/*
 * Copyright (c) 2003 by SAP AG. All Rights Reserved.
 *
 * SAP, mySAP, mySAP.com and other SAP products and
 * services mentioned herein as well as their respective
 * logos are trademarks or registered trademarks of
 * SAP AG in Germany and in several other countries all
 * over the world. MarketSet and Enterprise Buyer are
 * jointly owned trademarks of SAP AG and Commerce One.
 * All other product and service names mentioned are
 * trademarks of their respective companies.
 *
 * @version $Id$
 */

package com.sapportals.wcm.util.html;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

/**
 * Reads HTML documents and generates events. <p>
 *
 * The IHTMLReader generates events for HTML documents. Events are sent to the
 * {@link com.sapportals.wcm.util.html.IHTMLContentHandler}. There can be only
 * one content handler per reader. <p>
 *
 * A document is parsed by first setting the input source and then calling
 * {@link #parse()} once or {@link #parseNextEvent()} repeatedly. {@link #parseNextEvent()} 
 * parses the document until the next event was sent to the
 * content handler and then returns to the caller. It is not guaranteed that
 * exactly one event is generated. <p>
 *
 * Input Sources and Encodings:
 * <ul>
 *   <li> When a {@link java.io.Reader} is used as input source, no attempt is
 *   made to detect the encoding of the html document. {@link #getEncoding()} will
 *   return <code>null</code> in that case. </li>
 *   <li> When a {@link java.io.InputStream} together with an encoding is used,
 *   any encoding specified in <code>meta</code> tags of the document is ignored
 *   and the given encoding is used. </li>
 *   <li> When a {@link java.io.InputStream} without an encoding is used,
 *   the reader will look into the first n octets of the html document to detect
 *   an HTML <code>meta</code> tag with Content-Type which specifies the
 *   character set to use. If no encoding is found, ISO-8859-1 is assumed. The
 *   number of octets used for encoding detection is implementation defined.
 *   </li>
 * </ul>
 * <p>
 *
 * Note that implementations of this class are <b>not multithread-safe</b> . <p>
 *
 * Copyright (c) SAP AG 2001-2002
 *
 * @author stefan.eissing@greenbytes.de
 * @version $Id: IHTMLReader.java,v 1.3 2002/04/26 13:59:02 sei Exp $
 */
public interface IHTMLReader {

  /**
   * Returns the content handler currently registered with this reader.
   *
   * @return the registered content handler, or <code>null</code> if none is
   *      installed
   */
  IHTMLContentHandler getContentHandler();

  /**
   * Registers the content handler that receives the events of this reader.
   * Passing <code>null</code> is permitted and deregisters an installed
   * handler.
   *
   * @param handler the content handler to register, or <code>null</code> to
   *      deregister
   */
  void setContentHandler(IHTMLContentHandler handler);

  /**
   * Reports the encoding used in the document.
   *
   * @return the encoding used in the document, or <code>null</code> if it is
   *      unknown
   * @throws HTMLException when document is not legal HTML
   * @throws IOException on read errors
   */
  String getEncoding() throws HTMLException, IOException;

  /**
   * Uses the given InputStream as document source. The encoding of the
   * document will be detected.
   *
   * @param input the stream to read the document from
   * @throws HTMLException when document is not legal HTML
   * @throws IOException on read errors
   */
  void setSource(InputStream input) throws HTMLException, IOException;

  /**
   * Uses the given InputStream as document source and reads it with the given
   * encoding.
   *
   * @param input the stream to read the document from
   * @param encoding the encoding to use for the stream
   * @throws HTMLException when document is not legal HTML
   * @throws IOException on read errors
   */
  void setSource(InputStream input, String encoding) throws HTMLException, IOException;

  /**
   * Uses the given Reader as document source. The encoding is irrelevant for
   * this kind of source.
   *
   * @param input the reader to read the document from
   * @throws HTMLException when document is not legal HTML
   * @throws IOException on read errors
   */
  void setSource(Reader input) throws HTMLException, IOException;

  /**
   * Parses the complete document, generating events, until the source has
   * been read empty.
   *
   * @throws HTMLException when document is not legal HTML
   * @throws IOException on read errors
   */
  void parse() throws HTMLException, IOException;

  /**
   * Parses the document, generating events, and then returns to the caller.
   * The result is <code>true</code> as long as there are more events to read.
   *
   * @return <code>true</code> if there are more events to read
   * @throws HTMLException when document is not legal HTML
   * @throws IOException on read errors
   */
  boolean parseNextEvent() throws HTMLException, IOException;

  /**
   * Frees all allocated resources. Calling this method is not necessary once
   * parsing has finished.
   */
  void discard();
}
