/*
 * Copyright (c) 2003 by SAP AG. All Rights Reserved.
 *
 * SAP, mySAP, mySAP.com and other SAP products and
 * services mentioned herein as well as their respective
 * logos are trademarks or registered trademarks of
 * SAP AG in Germany and in several other countries all
 * over the world. MarketSet and Enterprise Buyer are
 * jointly owned trademarks of SAP AG and Commerce One.
 * All other product and service names mentioned are
 * trademarks of their respective companies.
 *
 * @version $Id$
 */

package com.sapportals.wcm.util.string;

import java.io.BufferedReader;
import java.io.CharArrayReader;
import java.io.File;
import java.io.FileWriter;
import java.io.FilterReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.util.Vector;

/**
 * Title: Strip HTML tags from a Reader Description: It works only with proper
 * tags, so character references needs to be converted before applying this
 * Filter, see <code>ReplaceHTMLTokens</code> <p>
 *
 * Usage Example:<p>
 *
 * <pre><code>in = new BufferedReader(
 * new ReplaceTokens(
 * new RemoveHTMLTags(
 * new CharArrayReader(content.toCharArray()))));
 * </code> </pre> Copyright (c) SAP AG 2001-2002 Company: SAP AG
 *
 * @author corneliu.mitu@sap.com
 * @version 1.0
 */

public class RemoveHTMLTags extends FilterReader {
  /**
   * boolean inTag
   */
  boolean inTag = false;
  private int _buf_location;
  private Hashtable _tags;
  private Hashtable _entities;
  private String _buf;
  private boolean _capture_head;
  private boolean _noSpace;
  private int _lastChar;
  private StringBuffer _head;
  private Vector _headers;
  private boolean _noCR;
  public static String newline = System.getProperty("line.separator");

  /**
   * <init>
   *
   * @param in parameter for <init>
   */
  public RemoveHTMLTags(Reader in) {
    super(in);
    _tags = new Hashtable();
    _entities = new Hashtable();
    _buf_location = -1;
    _headers = new Vector();
    _capture_head = false;
    _noCR = false;
    _noSpace = false;

    this.useStandardEntities();
    this.useStandardSubstitutes();

  }

  /**
   * read
   *
   * @param len parameter for read
   * @param b TBD: Description of the incoming method parameter
   * @param off TBD: Description of the incoming method parameter
   * @return the returned int
   * @throws IOException -
   */
  public int read(char b[], int off, int len)
    throws IOException {
    int l = 0;
    int k;
    if ((k = read()) == -1) {
      return -1;
    }
    else {
      b[off + l++] = (char)k;
    }
    while ((k = read()) != -1 && l < len - 1) {
      b[off + l++] = (char)k;
    }
    return l;
  }


  /**
   * Reads a char. Will block if no input is available.
   *
   * @return the char read, or -1 if the end of the stream is reached.
   * @throws IOException If an I/O error has occurred.
   * @see java.io.InputStream#read()
   */
  public int read()
    throws IOException {
    if (_buf_location >= 0 && _buf_location < _buf.length()) {
      return _buf.charAt(_buf_location++);
    }
    int i;
    while ((i = super.in.read()) > 0) {
      if (i == 60) {// < char
        String s = readTagword();
        parse(s);
        if (_buf_location >= 0 && _buf_location < _buf.length()) {
          return _buf.charAt(_buf_location++);
        }
      }
      else if (i == 38) {// & char
        readEntity();
        if (_buf_location >= 0 && _buf_location < _buf.length()) {
          return _buf.charAt(_buf_location++);
        }
      }
      else {
        if ((i == 10 || i == 13) && _noCR) {//eol
          i = 32;
        }
        if (i != 32 || _lastChar != 32 || !_noSpace) {//space
          _lastChar = i;
          return i;
        }
      }
    }
    _lastChar = i;
    return i;
  }

  private void parse(String s)
    throws IllegalArgumentException {
    StringTokenizer stringtokenizer = new StringTokenizer(s);
    if (!stringtokenizer.hasMoreTokens()) {
      throw new IllegalArgumentException("No Tags in String");
    }
    String s1;
    if ((s1 = stringtokenizer.nextToken()) == null) {
      return;
    }
    s1 = s1.toUpperCase();
    /*
     * if (s1.startsWith("H") && Character.isDigit(s1.charAt(1)))
     * _capture_head = true;
     * if (s1.startsWith("/H") && Character.isDigit(s1.charAt(2))) {
     * _capture_head = false;
     * _headers.addElement(_head.toString());
     * _head = new StringBuffer();
     * }
     */
    if ((_buf = (String)_tags.get(s1)) != null) {
      _buf_location = 0;
      return;
    }
    else {
      _buf_location = -1;
      return;
    }
  }

  /**
   * Defines an entity substitution set. The user should only use the base tag
   * definition and not include the & and ; characters.
   *
   * @param entity - the entity to substitute. Example: quot, #90, copy.
   * @param substitute - string to sub for the entity.
   * @see #useStandardEntities()
   * @throws NullPointerException
   */
  public void addEntity(String entity, String substitute)
    throws NullPointerException {
    _entities.put(
      (new String(entity)).toUpperCase(),
      new String(substitute));
  }


  /**
   * Defines a tag substitution set. The user should only use the base tag
   * definition (when substituting, attributes are ignored anyway). For example,
   * addSubstitute("hr", "-=-=-=") will substitute any HR tag found (even with
   * attributes) with the -=-=-= characters. Note: beginning and ending need
   * their own definitions
   *
   * @param tag - the tag to substitute. Use base tag without attributes.
   * @param substitute - string to sub for tag.
   * @see #useStandardSubstitutes()
   * @throws NullPointerException
   */
  public void addSubstitute(String tag, String substitute)
    throws NullPointerException {
    _tags.put((new String(tag)).toUpperCase(), new String(substitute));
  }

  private String readTagword() {
    StringBuffer stringbuffer = new StringBuffer();
    for (int i = 0; i != -1; ) {
      try {
        i = super.in.read();
      }
      catch (IOException ioexception) {
            //$JL-EXC$        
        ; //System.out.println("Error readtagword: " + ioexception);
      }
      if (i == 62) {
        break;
      }
      stringbuffer.append((char)i);
    }
    return stringbuffer.toString();
  }

  private void readEntity() {
    StringBuffer stringbuffer = new StringBuffer();
    for (int i = 0; i != -1; ) {
      try {
        i = super.in.read();
      }
      catch (IOException ioexception) {
            //$JL-EXC$        
        ; //System.out.println("Error readtagword: " + ioexception);
      }
      if (i == 59) {
        break;
      }
      stringbuffer.append((char)i);
    }

    if ((_buf =
      (String)_entities.get(stringbuffer.toString().toUpperCase()))
       != null) {
      _buf_location = 0;
      return;
    }
    else {
      _buf_location = -1;
      return;
    }
  }

  /**
   * Tells the filter to set up a standard tag substitution set.
   */
  public synchronized void useStandardSubstitutes() {
    //addSubstitute("title", "Title: ");
    //addSubstitute("/title", "\n\n");
    addSubstitute("p", "\n");
    addSubstitute("br", "\n");
    //addSubstitute("h1", "\n");
    addSubstitute("/h1", "\n");
    //addSubstitute("h2", "\n");
    addSubstitute("/h2", "\n");
    //addSubstitute("h3", "\n");
    addSubstitute("/h3", "\n");
    addSubstitute("/h4", "\n");
    addSubstitute("/h5", "\n");
    addSubstitute("/h6", "\n");
    //addSubstitute("img", "{image}");
    //addSubstitute("hr", "\n-------------------------\n");
    //addSubstitute("ol", "\n");
    //addSubstitute("li", "\n   * ");
    //addSubstitute("/ol", "\n");
    //addSubstitute("ul", "\n");
    //addSubstitute("/ul", "\n");
  }

  /**
   * Tells the filter to set up a standard entity substitution set. This is
   * relatively limited and the default set is shown below: / Original
   * Substitution / quot " / amp & / lt < / gt > / copy (C)
   */
  public synchronized void useStandardEntities() {
    addEntity("quot", "\"");
    addEntity("amp", "&");
    addEntity("lt", "<");
    addEntity("gt", ">");
    addEntity("copy", "(C)");
  }

  /**
   * Tells the filter to clear all current tag substitutions.
   */
  public synchronized void clearSubstitutes() {
    _tags.clear();
  }

  /**
   * Tells the filter to clear all current entity substitutions
   */
  public synchronized void clearEntities() {
    _entities.clear();
  }

  /**
   * Define whether multiple spaces should be compressed into a single space.
   * This does not affect other "white space" characters.
   *
   * @param flag
   */
  public synchronized void setCompressSpace(boolean flag) {
    _noSpace = flag;
  }

  /**
   * You can tell the filter to remove carraige returns (\n and \r) when
   * parsing. This allows the caller to insert CR's where desired since CR's in
   * HTML do not confer any special meaning
   *
   * @param flag - remove carraige returns during parse and substitue spaces
   */
  public synchronized void setNoCR(boolean flag) {
    _noCR = flag;
  }

  /**
   * You can tell the filter to remove carraige returns (\n and \r) when
   * parsing. This allows the caller to insert CR's where desired since CR's in
   * HTML do not confer any special meaning
   *
   * @return boolean
   */
  public synchronized boolean getNoCR() {
    return _noCR;
  }

  /**
   * The main program for test
   *
   * @param args The command line arguments
   */
  public static void main(String[] args) {

    String line = null;
    testStripString();

//			try {
//				//point to a http address or a local html file
//				URL url = new URL ("file:/c:/java/javaprojects/httmlstrip/test4.html");
//
//				/*BufferedReader input = new BufferedReader(
//																				new InputStreamReader(
//																				url.openStream()));
//				String inputL;
//				while ((inputL = input.readLine()) != null)
//						System.out.println(inputL);
//				input.close();
//
//				InputStreamReader in = new InputStreamReader (
//						url.openStream());
//				String inputLine;
//				StringBuffer temp = new StringBuffer(1000);
//				RemoveHTMLTags tags = new RemoveHTMLTags(new ReplaceHTMLTokens(in));
//				tags.addSubstitute("h1","REPLACED");
//				BufferedReader input1 = new BufferedReader (tags);
//				while ( (line = input1.readLine()) != null) {
//								System.out.println(line);
//								temp= temp.append(line);
//								//temp.append(newline);
//							}
//				tags.close();
//				System.out.println("As string:"+temp.toString());
//				*/
//				//in a file
//				char[] buf = new char[30];
//				InputStreamReader in2 = new InputStreamReader (
//										url.openStream());
//				RemoveHTMLTags tags2 = new RemoveHTMLTags(new ReplaceHTMLTokens(in2));
//				tags2.setCompressSpace(true);
//				tags2.setNoCR(true);
//				BufferedReader input2 = new BufferedReader(tags2);
//				File fs = new File("c:/java/javaprojects/httmlstrip/test4.txt");
//				FileWriter fw = new FileWriter(fs);
//				int number = -1;
//				while ((number =input2.read(buf)) != -1) {
//					fw.write(buf,0,number);
//				}
//				//fw.flush();
//				fw.close();
//
//			}
//			catch (Exception e) {
//				e.printStackTrace();
//			}
  }

  private static void testStripString() {

    String testString = "<HTML><BODY>123456</BODY></HTML>";
    Reader im2 = new CharArrayReader(testString.toCharArray());
    char[] buff = new char[50];
    String line = "";
    int no = 0;
    try {

      BufferedReader in2 = new BufferedReader(new RemoveHTMLTags(new ReplaceHTMLTokens(im2)));
      while ((no = in2.read(buff, 0, 50)) != -1) {
        line = new String(buff, 0, no);
        if (line.length() >= 50) {
          line += "...";
        }
      }

    }
    catch (IOException ex) {
            //$JL-EXC$      
      ;
    }

    System.out.println(line);
    //tags2.setCompressSpace(true);
    //tags2.setNoCR(true);
//			int nextChar;
//			try{
//
//			while ( ( nextChar = tags2.read() ) != -1  ) {
//			     System.out.print(Character.toUpperCase( (char) nextChar ) );
//			}
//			} catch (IOException ex) {
//			}
//			System.out.println( '\n' );
    // not needed before a close
  }
}
