//
// TokenFactory.java
//
//	Lexical analyzer for RichText component.  Parses a String into
//	a stream of Tokens.
//
//
//  Copyright (c) 1998, 2000 Silicon Graphics, Inc.  All Rights Reserved.
//  
//  This program is free software; you can redistribute it and/or modify
//  it under the terms of version 2.1 of the GNU Lesser General Public
//  License as published by the Free Software Foundation.
//  
//  This program is distributed in the hope that it would be useful, but
//  WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//  
//  Further, this software is distributed without any warranty that it is
//  free of the rightful claim of any third person regarding infringement
//  or the like.  Any license provided herein, whether implied or
//  otherwise, applies only to this software file.  Patent licenses, if
//  any, provided herein do not apply to combinations of this program
//  with other software, or any other product whatsoever.
//  
//  You should have received a copy of the GNU Lesser General Public
//  License along with this program; if not, write the Free Software
//  Foundation, Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307,
//  USA.
//  
//  Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
//  Mountain View, CA 94043, or http://www.sgi.com/
//  
//  For further information regarding this notice, see:
//  http://oss.sgi.com/projects/GenInfo/NoticeExplan/
//

package com.sgi.sysadm.ui.richText;

import com.sgi.sysadm.util.*;
import java.text.*;

/**
 * TokenFactory parses a string of characters into a stream of Token
 * objects, which themselves are parsed by Document, Paragraph, and
 * List objects.
 * <p>
 * A single Token instance is reused for every call to
 * getNextToken(); consumers must read what they need from the
 * returned Token before asking for the next one.
 * <p>
 * NOTE(review): <code>assert</code> has been a reserved word since
 * Java 1.4, so the <code>Log.assert</code> calls below only compile
 * at an older source level -- confirm the build settings before
 * upgrading the tool chain.
 */
class TokenFactory {
    // The complete text being tokenized.
    private String _text;

    // The one Token handed out (and overwritten) by getNextToken().
    private Token _token = new Token();

    // True when the next getNextToken() call must re-deliver _token.
    private boolean _unget = false;

    // Offset in _text of the first character not yet consumed.
    private int _index;

    // Cached _text.length().
    private int _len;

    // Finds the places where a line may legally be broken.
    private BreakIterator _iter = BreakIterator.getLineInstance();

    // Introduces an escape sequence.
    static final char ESCAPE_BEGIN = '&';

    // Terminates an escape sequence.
    static final char ESCAPE_END = ';';

    // An element in our table of escapes: a special character and
    // the name used for it between ESCAPE_BEGIN and ESCAPE_END.
    static class Escape {
        char _escapeChar;
        String _escapeStr;

        Escape(char escapeChar, String escapeStr) {
            _escapeChar = escapeChar;
            _escapeStr = escapeStr;
        }
    }

    // Our table of escapes.
    static Escape[] _escapes = {
        new Escape('<', "lt"),
        new Escape('>', "gt"),
        new Escape('&', "amp"),
    };

    /**
     * Construct a TokenFactory.
     *
     * @param text The HTML text to parse.  Must not be null.
     */
    TokenFactory(String text) {
        _text = text;
        _len = _text.length();
        _iter.setText(_text);
        _index = _iter.first();
    }

    /**
     * Retrieve the next Token from our string.  This is the grungy
     * character parsing code.
     * <p>
     * Text is cut at whichever comes first: the next line-break
     * opportunity reported by BreakIterator, or the next '&lt;'
     * that introduces a tag.  Tags that are not listed in
     * Token.TAG_MAP are skipped silently.  Once the text is
     * exhausted (or an unterminated tag is met) every call returns
     * an END_DOCUMENT tag token.
     *
     * @return The next token.  The same Token instance is returned
     *         by every call.
     */
    Token getNextToken() {
        // This gives consumers one level of "put-back".
        if (_unget) {
            Log.assert(_token.getType() != Token.INVALID_TOKEN,
                       "_unget set with invalid token");
            _unget = false;
            return _token;
        }

        // Loop until something sets a valid type; unknown tags leave
        // the token invalid so that we simply move on past them.
        _token.setType(Token.INVALID_TOKEN);
        while (_token.getType() == Token.INVALID_TOKEN) {
            if (_index >= _len) {
                setEndOfDocument();
                continue;
            }

            // We look for line breaks as defined by BreakIterator,
            // which should make this code localizable to Asian
            // languages.  We also look for the '<' character to
            // introduce HTML tags.  Each time through we parse up to
            // whichever of these is coming up sooner.
            int lineBreak = _iter.following(_index);
            int tagStart = _text.indexOf('<', _index);

            if (lineBreak == BreakIterator.DONE && tagStart == -1) {
                setEndOfDocument();
            } else if (tagStart == -1
                       || (lineBreak != BreakIterator.DONE
                           && lineBreak <= tagStart)) {
                // A break opportunity comes first (or coincides with
                // the tag): emit the text up to it.
                setType(_text.substring(_index, lineBreak));
                _token.setOKToBreakAfter(true);
                _index = lineBreak;
            } else if (tagStart > _index) {
                // From here on a tag is known to exist and to come
                // before the next break opportunity.  Emit the text
                // in front of it; no break is allowed between that
                // text and the tag.
                setType(_text.substring(_index, tagStart));
                _token.setOKToBreakAfter(false);
                _index = tagStart;
            } else {
                // We are sitting right on the '<'.
                parseTag(tagStart);
            }
        }
        return _token;
    }

    /**
     * Causes the next call to getNextToken() to return the same
     * token that was returned last time, allowing a consumer to look
     * at the next token and then put it back for another consumer.
     * Only one level of put-back is supported.
     */
    void unget() {
        Log.assert(_token.getType() != Token.INVALID_TOKEN,
                   "Attempt to unget an invalid token");
        Log.assert(!_unget, "Attempt to unget twice");
        _unget = true;
    }

    /**
     * Consume the tag whose '&lt;' is at tagStart.  A tag found in
     * Token.TAG_MAP turns _token into a TAG_TOKEN carrying the tag's
     * index and its parameters; any other tag is skipped, leaving
     * _token untouched.  A tag with no closing '&gt;' ends the
     * document.
     *
     * @param tagStart Offset of the '&lt;' in _text.
     */
    private void parseTag(int tagStart) {
        int contentStart = tagStart + 1;
        int end = _text.indexOf('>', contentStart);
        if (end == -1) {
            _index = _len;
            setEndOfDocument();
            return;
        }

        // Truncate the tag at the first whitespace so that things
        // like "HREF=" don't mess us up.
        int nameEnd = contentStart;
        while (nameEnd < end && !isSpace(_text.charAt(nameEnd))) {
            nameEnd++;
        }
        String tagString = _text.substring(contentStart, nameEnd);

        // The position in TAG_MAP is the tag value.  Stop at the
        // first null entry, but never run off the end of the array.
        int tag = Token.INVALID;
        for (int i = 0;
             i < Token.TAG_MAP.length && Token.TAG_MAP[i] != null;
             i++) {
            if (tagString.equalsIgnoreCase(Token.TAG_MAP[i])) {
                tag = i;
                break;
            }
        }
        if (tag != Token.INVALID) {
            _token.setType(Token.TAG_TOKEN);
            _token.setTagValue(tag);
            parseTagParams(_text.substring(nameEnd, end));
        }
        _index = end + 1;
    }

    /**
     * Turn _token into the END_DOCUMENT tag token.
     */
    private void setEndOfDocument() {
        _token.setType(Token.TAG_TOKEN);
        _token.setTagValue(Token.END_DOCUMENT);
    }

    /**
     * Tell whether ch is one of the characters this lexer treats as
     * whitespace: space, tab, newline or carriage return.
     *
     * @param ch Character to test.
     *
     * @return true if ch is whitespace.
     */
    private static boolean isSpace(char ch) {
        return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
    }

    /**
     * Set a Token to either be a WORD token or a SPACE token.  By
     * treating strings of spaces specially we eliminate extra spaces
     * from the input.  A WORD token gets the unescaped word as its
     * value; an empty word counts as a SPACE token.
     *
     * @param word word which might just be a bunch of spaces.
     */
    private void setType(String word) {
        int wordLen = word.length();
        for (int i = 0; i < wordLen; i++) {
            if (!isSpace(word.charAt(i))) {
                _token.setType(Token.WORD_TOKEN);
                _token.setWordValue(unescape(word));
                return;
            }
        }
        _token.setType(Token.SPACE_TOKEN);
    }

    /**
     * Replace escape sequences in str with the characters they
     * represent.  Escape names are matched without regard to case;
     * anything that is not a known escape is copied unchanged.
     *
     * @param str String to replace escape sequences in.
     *
     * @return unescaped string.
     */
    static String unescape(String str) {
        // Nothing to replace unless an escape is introduced.
        if (str.indexOf(ESCAPE_BEGIN) == -1) {
            return str;
        }

        int len = str.length();
        StringBuffer buf = new StringBuffer(len);
        int src = 0;
        while (src < len) {
            char ch = str.charAt(src);
            if (ch == ESCAPE_BEGIN) {
                int semi = str.indexOf(ESCAPE_END, src);
                if (semi != -1) {
                    String name = str.substring(src + 1, semi);
                    int match = -1;
                    for (int i = 0; i < _escapes.length; i++) {
                        if (name.equalsIgnoreCase(_escapes[i]._escapeStr)) {
                            match = i;
                            break;
                        }
                    }
                    if (match != -1) {
                        buf.append(_escapes[match]._escapeChar);
                        src = semi + 1;
                        continue;
                    }
                }
            }
            // Ordinary character, or an ESCAPE_BEGIN that does not
            // start a known escape.
            buf.append(ch);
            src++;
        }
        return buf.toString();
    }

    /**
     * Replace special characters with escape sequences.
     *
     * @param str String to replace special characters in.
     *
     * @return escaped string.
     */
    static String escape(String str) {
        int len = str.length();
        StringBuffer buf = new StringBuffer(len);

        for (int i = 0; i < len; i++) {
            char ch = str.charAt(i);
            String name = null;
            for (int esc = 0; esc < _escapes.length; esc++) {
                if (ch == _escapes[esc]._escapeChar) {
                    name = _escapes[esc]._escapeStr;
                    break;
                }
            }
            if (name == null) {
                buf.append(ch);
            } else {
                buf.append(ESCAPE_BEGIN);
                buf.append(name);
                buf.append(ESCAPE_END);
            }
        }
        return buf.toString();
    }

    /**
     * Parse an HTML tag into key/value pairs and store them on
     * _token, replacing any parameters it held before.  A value runs
     * from just after the '=' to the next whitespace; quotes are not
     * interpreted and are kept as part of the value.
     *
     * @param params The key/value section of an HTML tag.
     */
    private void parseTagParams(String params) {
        _token.clearTagParams();
        int paramsLen = params.length();
        int start = 0;
        int equals;
        while ((equals = params.indexOf('=', start)) != -1) {
            int end = equals + 1;
            while (end < paramsLen) {
                char ch = params.charAt(end);
                if (isSpace(ch) || ch == '>') {
                    break;
                }
                end++;
            }
            _token.setTagParam(params.substring(start, equals).trim(),
                               params.substring(equals + 1, end));
            // Resume after the value, so that it neither leaks into
            // the next key nor gets searched for another '='.
            start = end;
        }
    }
}
