/* SGML TAG scanner */

import java.util.*;
import java.io.*;

/**
 * Minimal, forgiving, byte-oriented scanner for HTML/SGML markup.
 *
 * Each call to {@link #getElement()} consumes the input up to and including
 * the next tag and returns a Hashtable describing it:
 *   key ""        - the upper-cased tag name (for comments it starts with "!--")
 *   key CONTENT   - the trimmed text that preceded the tag (only if non-empty)
 *   other keys    - upper-cased attribute names mapped to their values
 *                   ("" for attributes given without a value)
 *
 * Bytes are mapped to characters one-to-one (ISO-8859-1 style). Everything
 * read from the input is copied to the optional output stream as it is
 * buffered. End of input is reported with an EOFException; text following the
 * last '<' of the input is discarded when that happens.
 */
public class htmlscanner
{

 /* size of the read buffer in bytes */
 public final static int BUFFERSIZE=4096;
 /* a tag name starting with this is a comment */
 public final static String KSTART="!--";
 public final static String KEND="-->";

 /* result key under which the text preceding a tag is stored */
 public final static String CONTENT="<content>";

 private transient byte buf[];
 private transient int bpos;
 private transient int bsize;

 private InputStream is;
 private OutputStream os;
 /* number of bytes handed out by readByte() */
 private int bytes;
 /* set once the end of the input has been reached */
 private boolean eof;

 /**
  * @param is stream to scan
  * @param os optional stream that receives a copy of everything read from
  *           is; may be null
  */
 htmlscanner(InputStream is,OutputStream os)
 {
  this.is=is;
  this.os=os;
  bytes=0;
  eof=false;
 }

  /**
   * Reads the text up to the next tag and then the tag itself.
   *
   * @return table describing the tag, see the class comment; never null
   * @throws EOFException when the input ends before a complete tag was read
   * @throws IOException  when reading the input fails
   */
  public final Hashtable getElement() throws IOException
  {
    Hashtable result=new Hashtable();

    /* find the start of a tag, i.e. '<'; whatever precedes it is content */
    String data=findByte((byte)0x3c);
    if (data.length() > 0)
    {
       result.put(CONTENT,data);
    }

    /* we are behind '<': this is either a comment or an ordinary tag */
    byte b=readByte();

    /* skip optional white space */
    while(isSpace(b))
     b=readByte();

    /* read the tag name */
    StringBuffer tag=new StringBuffer();
    String tags;
    while(true)
    {
     if(isSpace(b)) break;
     if(b=='>')
      {
       tags=upper(tag);
       /* a '>' inside a comment does not necessarily end it */
       if(tags.startsWith(KSTART)) break;
       result.put("",tags);
       return result;
      }
     tag.append(toChar(b));
     b=readByte();
    }
    tags=upper(tag);
    result.put("",tags);

    if(tags.startsWith(KSTART))
    {
      /* a comment without white space, e.g. <!--x-->, is already complete */
      boolean complete= tags.endsWith("--") && b=='>' && tags.length()>3;
      if(!complete) findEndComment();
      return result;
    }

    /* read the NAME=VALUE pairs */
    while(true)
    {
     StringBuffer name=new StringBuffer();

     /* skip optional white space */
     while(isSpace(b))
       b=readByte();

     /* read the name up to white space or '=' */
     while(!isSpace(b) && b!='=')
     {
      if(b=='>')
       {
        putBareAttribute(result,name);
        return result;
       }
      name.append(toChar(b));
      b=readByte();
     }

     /* skip optional white space between the name and '=' */
     while(isSpace(b))
       b=readByte();

     if(b=='>')
      {
       putBareAttribute(result,name);
       return result;
      }

     if(b!='=')
      {
       /* attribute without a value; b already starts the next attribute */
       putBareAttribute(result,name);
       continue;
      }

     /* skip '=' and the white space after it */
     b=readByte();
     while(isSpace(b))
       b=readByte();

     StringBuffer value=new StringBuffer();
     if(b=='"' || b=='\'')
      {
       byte endchar=b;
       b=readByte();
       /* run up to the matching quote; HACK: a '>' also ends the value so
          that a missing closing quote does not swallow the rest of the file */
       while(b!=endchar && b!='>')
        {
         value.append(toChar(b));
         b=readByte();
        }
       /* step over the closing quote only; a '>' must stay in b so that
          the end of the tag is still recognised */
       if(b==endchar) b=readByte();
      }
     else
      {
       /* an unquoted value ends at white space or at the end of the tag */
       while(!isSpace(b) && b!='>')
        {
         value.append(toChar(b));
         b=readByte();
        }
      }

     /* a value without a name ("<a =x>") must not overwrite the tag name,
        which is stored under the empty key */
     if(name.length()>0)
       result.put(upper(name),value.toString());
    }
  }

  /* Stores a value-less attribute unless its name is empty. */
  private static void putBareAttribute(Hashtable result,StringBuffer name)
  {
    if(name.length()>0) result.put(upper(name),"");
  }

  /* Upper-cases independently of the default locale (no Turkish dotted I). */
  private static String upper(StringBuffer s)
  {
    return s.toString().toUpperCase(Locale.ENGLISH);
  }

  /* Maps a byte to the character with the same unsigned value. */
  private static char toChar(byte b)
  {
    return (char) (b & 0xFF);
  }

  /* True for space, tab, line feed, form feed and carriage return. */
  private static boolean isSpace(byte b)
  {
    return b==' ' || b=='\t' || b=='\n' || b=='\f' || b=='\r';
  }

  /**
   * Consumes the input up to and including the first occurrence of what.
   *
   * @return the bytes read before what, as a trimmed string
   */
  private final String findByte(byte what) throws IOException
  {
      StringBuffer res=new StringBuffer(80);
      byte b;

      while(what!= (b=readByte()) )
      {
	   res.append(toChar(b));
      }

   return res.toString().trim();
  }

  /**
   * Consumes the input up to and including the end of a comment, i.e. a
   * '>' that is directly preceded by at least two '-'.
   */
  private final void findEndComment() throws IOException
  {
   int dashsize=0;
   while(true)
   {
    byte b=readByte();
    if(b==0x2d)
      {
       dashsize++;
      }
    else
      {
       if(b==0x3e && dashsize>=2) return;
       dashsize=0;
      }
   }
  }

  /**
   * Returns the next input byte, refilling the buffer when it is exhausted.
   * Every freshly read chunk is copied to the output stream, if there is one;
   * a failing output stream is closed and dropped without aborting the scan.
   *
   * @throws EOFException at the end of the input (both streams are closed
   *         then) and on every later call
   * @throws IOException  when reading fails or the scanner has been closed
   */
  private final byte readByte() throws IOException
  {
    if(eof) throw new EOFException();
    if(is==null) throw new IOException("htmlscanner is closed");
    if(buf==null) buf=new byte[BUFFERSIZE];
    if( bpos==bsize )
      {
       /* mark the buffer empty first so that a failed read can never make
          a later call hand out stale bytes */
       bpos=0;
       bsize=0;
       int got;
       try
       {
        got=is.read(buf);
       }
       catch ( InterruptedIOException e2)
         { is.close();
           if(os!=null) os.close();
           throw e2;
         }

       if(got<1)
         {
          eof=true;
          is.close();
          if(os!=null) os.close();
          throw new EOFException();
         }
       bsize=got;

       if(os!=null)
         {
          try
           {
            os.write(buf,0,bsize);
           }
          catch (IOException ioe)
           {
            /* the copy is best effort only: give up on the output stream */
            try
             {
              os.close();
             }
            catch(IOException ignore) {}
            os=null;
           }
         }
      }

   bytes++;
   return buf[bpos++];
  }

  /**
   * Releases the buffer and closes both streams. The input stream is closed
   * even if closing the output stream fails. Safe to call more than once.
   */
  public final void close() throws IOException
  {
   buf=null;
   bpos=bsize=0;
   try
    {
     if(os!=null) os.close();
    }
   finally
    {
     os=null;
     if(is!=null)
      {
       InputStream in=is;
       is=null;
       in.close();
      }
    }
  }

  public final void finalize() throws Throwable
  {
   close();
  }

  /**
   * Test driver: prints the tag name and the preceding content of every
   * element found in the file named by the first argument.
   */
  public final static void main(String argv[]) throws IOException
  {
      if(argv.length<1)
      {
	  System.err.println("usage: java htmlscanner <file>");
	  return;
      }

      htmlscanner hscan=new htmlscanner(new FileInputStream(argv[0]),null);
      try {
	      while(true)
	      {
		  Hashtable res=hscan.getElement();
		  System.out.println("tag="+res.get("")+" content="+res.get(htmlscanner.CONTENT));
	      }
      }
      catch (EOFException wearedone)
      {
	  /* regular end of input */
      }
      finally
      {
	  hscan.close();
      }
  }

}
