From 2a58f19635d9c8b2c5a1ce293b6a3fc5f57be7d8 Mon Sep 17 00:00:00 2001
From: Achal-Aggarwal <theachalaggarwal@gmail.com>
Date: Tue, 11 Mar 2014 17:57:08 +0530
Subject: [PATCH 2/4] Replacing content model with states (except SCRIPT DATA).

---
 include/hubbub/parser.h                     |    6 +-
 include/hubbub/types.h                      |   13 +-
 src/parser.c                                |    4 +-
 src/tokeniser/tokeniser.c                   | 1224 +++++++++++++++++++++++----
 src/tokeniser/tokeniser.h                   |    6 +-
 src/treebuilder/in_body.c                   |    4 +-
 src/treebuilder/treebuilder.c               |    6 +-
 test/data/tokeniser2/contentModelFlags.test |   50 +-
 test/data/tokeniser2/escapeFlag.test        |   24 +-
 test/tokeniser2.c                           |   47 +-
 10 files changed, 1129 insertions(+), 255 deletions(-)

diff --git a/include/hubbub/parser.h b/include/hubbub/parser.h
index bdc5e20..42d68cc 100644
--- a/include/hubbub/parser.h
+++ b/include/hubbub/parser.h
@@ -29,7 +29,7 @@ typedef struct hubbub_parser hubbub_parser;
 typedef enum hubbub_parser_opttype {
 	HUBBUB_PARSER_TOKEN_HANDLER,
 	HUBBUB_PARSER_ERROR_HANDLER,
-	HUBBUB_PARSER_CONTENT_MODEL,
+	HUBBUB_PARSER_INITIAL_STATE,
 	HUBBUB_PARSER_TREE_HANDLER,
 	HUBBUB_PARSER_DOCUMENT_NODE,
 	HUBBUB_PARSER_ENABLE_SCRIPTING,
@@ -51,8 +51,8 @@ typedef union hubbub_parser_optparams {
 	} error_handler;		/**< Error handling callback */
 
 	struct {
-		hubbub_content_model model;
-	} content_model;		/**< Current content model */
+		hubbub_initial_state state;
+	} initial_state;		/**< Initial state of tokeniser */
 
 	hubbub_tree_handler *tree_handler;	/**< Tree handling callbacks */
 
diff --git a/include/hubbub/types.h b/include/hubbub/types.h
index e5c208b..6e14fb7 100644
--- a/include/hubbub/types.h
+++ b/include/hubbub/types.h
@@ -29,12 +29,13 @@ typedef enum hubbub_charset_source {
 /**
  * Content model flag
  */
-typedef enum hubbub_content_model {
-	HUBBUB_CONTENT_MODEL_PCDATA,
-	HUBBUB_CONTENT_MODEL_RCDATA,
-	HUBBUB_CONTENT_MODEL_CDATA,
-	HUBBUB_CONTENT_MODEL_PLAINTEXT
-} hubbub_content_model;
+typedef enum hubbub_initial_state {
+	HUBBUB_INITIAL_STATE_DATA,
+	HUBBUB_INITIAL_STATE_RCDATA,
+	HUBBUB_INITIAL_STATE_CDATA,
+	HUBBUB_INITIAL_STATE_PLAINTEXT,
+	HUBBUB_INITIAL_STATE_RAWTEXT
+} hubbub_initial_state;
 
 /**
  * Quirks mode flag
diff --git a/src/parser.c b/src/parser.c
index 671e129..749c674 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -160,9 +160,9 @@ hubbub_error hubbub_parser_setopt(hubbub_parser *parser,
 		}
 		break;
 
-	case HUBBUB_PARSER_CONTENT_MODEL:
+	case HUBBUB_PARSER_INITIAL_STATE:
 		result = hubbub_tokeniser_setopt(parser->tok,
-				HUBBUB_TOKENISER_CONTENT_MODEL,
+				HUBBUB_TOKENISER_INITIAL_STATE,
 				(hubbub_tokeniser_optparams *) params);
 		break;
 
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index 3eab8a7..7152f05 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -43,16 +43,26 @@ static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };
 static const uint8_t lf = '\n';
 static const hubbub_string lf_str = { &lf, 1 };
 
-
 /**
  * Tokeniser states
  */
 typedef enum hubbub_tokeniser_state {
 	STATE_DATA,
 	STATE_CHARACTER_REFERENCE_DATA,
+	STATE_RCDATA,
+	STATE_CHARACTER_REFERENCE_RCDATA,
+	STATE_RAWTEXT,
+	STATE_SCRIPT_DATA,
+	STATE_PLAINTEXT,
 	STATE_TAG_OPEN,
 	STATE_CLOSE_TAG_OPEN,
 	STATE_TAG_NAME,
+	STATE_RCDATA_LESSTHAN,
+	STATE_RCDATA_CLOSE_TAG_OPEN,
+	STATE_RCDATA_CLOSE_TAG_NAME,
+	STATE_RAWTEXT_LESSTHAN,
+	STATE_RAWTEXT_CLOSE_TAG_OPEN,
+	STATE_RAWTEXT_CLOSE_TAG_NAME,
 	STATE_BEFORE_ATTRIBUTE_NAME,
 	STATE_ATTRIBUTE_NAME,
 	STATE_AFTER_ATTRIBUTE_NAME,
@@ -166,8 +176,6 @@ typedef struct hubbub_tokeniser_context {
  */
 struct hubbub_tokeniser {
 	hubbub_tokeniser_state state;	/**< Current tokeniser state */
-	hubbub_content_model content_model;	/**< Current content
-						 * model flag */
 	bool escape_flag;		/**< Escape flag **/
 	bool process_cdata_section;	/**< Whether to process CDATA sections*/
 	bool paused; /**< flag for if parsing is currently paused */
@@ -188,12 +196,34 @@ struct hubbub_tokeniser {
 static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
 static hubbub_error hubbub_tokeniser_handle_character_reference_data(
 		hubbub_tokeniser *tokeniser);
+static hubbub_error hubbub_tokeniser_handle_rcdata(
+		hubbub_tokeniser *tokeniser);
+static hubbub_error hubbub_tokeniser_handle_character_reference_rcdata(
+		hubbub_tokeniser *tokeniser);
+static hubbub_error hubbub_tokeniser_handle_rawtext(
+		hubbub_tokeniser *tokeniser);
+static hubbub_error hubbub_tokeniser_handle_script_data(
+		hubbub_tokeniser *tokeniser);
+static hubbub_error hubbub_tokeniser_handle_plaintext(
+		hubbub_tokeniser *tokeniser);
 static hubbub_error hubbub_tokeniser_handle_tag_open(
 		hubbub_tokeniser *tokeniser);
 static hubbub_error hubbub_tokeniser_handle_close_tag_open(
 		hubbub_tokeniser *tokeniser);
 static hubbub_error hubbub_tokeniser_handle_tag_name(
 		hubbub_tokeniser *tokeniser);
+static hubbub_error hubbub_tokeniser_handle_rcdata_lessthan(
+		hubbub_tokeniser *tokeniser);
+static hubbub_error hubbub_tokeniser_handle_rcdata_close_tag_open(
+		hubbub_tokeniser *tokeniser);
+static hubbub_error hubbub_tokeniser_handle_rcdata_close_tag_name(
+		hubbub_tokeniser *tokeniser);
+static hubbub_error hubbub_tokeniser_handle_rawtext_lessthan(
+		hubbub_tokeniser *tokeniser);
+static hubbub_error hubbub_tokeniser_handle_rawtext_close_tag_open(
+		hubbub_tokeniser *tokeniser);
+static hubbub_error hubbub_tokeniser_handle_rawtext_close_tag_name(
+		hubbub_tokeniser *tokeniser);
 static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
 		hubbub_tokeniser *tokeniser);
 static hubbub_error hubbub_tokeniser_handle_attribute_name(
@@ -313,7 +343,6 @@ hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
 	}
 
 	tok->state = STATE_DATA;
-	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
 
 	tok->escape_flag = false;
 	tok->process_cdata_section = false;
@@ -385,8 +414,18 @@ hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
 		tokeniser->error_handler = params->error_handler.handler;
 		tokeniser->error_pw = params->error_handler.pw;
 		break;
-	case HUBBUB_TOKENISER_CONTENT_MODEL:
-		tokeniser->content_model = params->content_model.model;
+	case HUBBUB_TOKENISER_INITIAL_STATE:
+		if (params->initial_state.state == HUBBUB_INITIAL_STATE_DATA) {
+			tokeniser->state = STATE_DATA;
+		} else if (params->initial_state.state == HUBBUB_INITIAL_STATE_RCDATA) {
+			tokeniser->state = STATE_RCDATA;
+		} else if (params->initial_state.state == HUBBUB_INITIAL_STATE_CDATA) {
+			tokeniser->state = STATE_CDATA_BLOCK;
+		} else if (params->initial_state.state == HUBBUB_INITIAL_STATE_PLAINTEXT) {
+			tokeniser->state = STATE_PLAINTEXT;
+		} else if (params->initial_state.state == HUBBUB_INITIAL_STATE_RAWTEXT) {
+			tokeniser->state = STATE_RAWTEXT;
+		}
 		break;
 	case HUBBUB_TOKENISER_PROCESS_CDATA:
 		tokeniser->process_cdata_section = params->process_cdata;
@@ -465,6 +504,26 @@ hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
 			cont = hubbub_tokeniser_handle_character_reference_data(
 					tokeniser);
 			break;
+		state(STATE_RCDATA)
+			cont = hubbub_tokeniser_handle_rcdata(
+					tokeniser);
+			break;
+		state(STATE_CHARACTER_REFERENCE_RCDATA)
+			cont = hubbub_tokeniser_handle_character_reference_rcdata(
+					tokeniser);
+			break;
+		state(STATE_RAWTEXT)
+			cont = hubbub_tokeniser_handle_rawtext(
+					tokeniser);
+			break;
+		state(STATE_SCRIPT_DATA)
+			cont = hubbub_tokeniser_handle_script_data(
+					tokeniser);
+			break;
+		state(STATE_PLAINTEXT)
+			cont = hubbub_tokeniser_handle_plaintext(
+					tokeniser);
+			break;
 		state(STATE_TAG_OPEN)
 			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
 			break;
@@ -475,6 +534,30 @@ hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
 		state(STATE_TAG_NAME)
 			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
 			break;
+		state(STATE_RCDATA_LESSTHAN)
+			cont = hubbub_tokeniser_handle_rcdata_lessthan(
+					tokeniser);
+			break;
+		state(STATE_RCDATA_CLOSE_TAG_OPEN)
+			cont = hubbub_tokeniser_handle_rcdata_close_tag_open(
+					tokeniser);
+			break;
+		state(STATE_RCDATA_CLOSE_TAG_NAME)
+			cont = hubbub_tokeniser_handle_rcdata_close_tag_name(
+					tokeniser);
+			break;
+		state(STATE_RAWTEXT_LESSTHAN)
+			cont = hubbub_tokeniser_handle_rawtext_lessthan(
+					tokeniser);
+			break;
+		state(STATE_RAWTEXT_CLOSE_TAG_OPEN)
+			cont = hubbub_tokeniser_handle_rawtext_close_tag_open(
+					tokeniser);
+			break;
+		state(STATE_RAWTEXT_CLOSE_TAG_NAME)
+			cont = hubbub_tokeniser_handle_rawtext_close_tag_name(
+					tokeniser);
+			break;
 		state(STATE_BEFORE_ATTRIBUTE_NAME)
 			cont = hubbub_tokeniser_handle_before_attribute_name(
 					tokeniser);
@@ -703,103 +786,899 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
 				break;
 			}
 
-			if (tokeniser->context.pending > 0) {
-				/* Emit any pending characters */
-				emit_current_chars(tokeniser);
-			}
+			if (tokeniser->context.pending > 0) {
+				/* Emit any pending characters */
+				emit_current_chars(tokeniser);
+			}
+
+			if (error == PARSERUTILS_EOF ||	*cptr != '\n') {
+				/* Emit newline */
+				emit_character_token(tokeniser, &lf_str);
+			}
+
+			/* Advance over */
+			parserutils_inputstream_advance(tokeniser->input, 1);
+		} else {
+			if (c == '\0') {
+				/** \todo parse error */
+			}
+
+			/* Just collect into buffer */
+			tokeniser->context.pending += len;
+		}
+	}
+	if (tokeniser->state != STATE_TAG_OPEN &&
+		(tokeniser->state != STATE_DATA || error == PARSERUTILS_EOF) &&
+			tokeniser->context.pending > 0) {
+		/* Emit any pending characters */
+		emit_current_chars(tokeniser);
+	}
+
+	if (error == PARSERUTILS_EOF) {
+		token.type = HUBBUB_TOKEN_EOF;
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+	}
+
+	if (error == PARSERUTILS_EOF) {
+		return HUBBUB_NEEDDATA;
+	} else {
+		return hubbub_error_from_parserutils_error(error);
+	}
+}
+
+
+
+/* emit any pending tokens before calling */
+hubbub_error hubbub_tokeniser_handle_character_reference_data(
+		hubbub_tokeniser *tokeniser)
+{
+	assert(tokeniser->context.pending == 0);
+
+	if (tokeniser->context.match_entity.complete == false) {
+		return hubbub_tokeniser_consume_character_reference(tokeniser,
+				tokeniser->context.pending);
+	} else {
+		hubbub_token token;
+
+		uint8_t utf8[6];
+		uint8_t *utf8ptr = utf8;
+		size_t len = sizeof(utf8);
+
+		token.type = HUBBUB_TOKEN_CHARACTER;
+
+		if (tokeniser->context.match_entity.codepoint) {
+			parserutils_charset_utf8_from_ucs4(
+				tokeniser->context.match_entity.codepoint,
+				&utf8ptr, &len);
+
+			token.data.character.ptr = utf8;
+			token.data.character.len = sizeof(utf8) - len;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			/* +1 for ampersand */
+			parserutils_inputstream_advance(tokeniser->input,
+					tokeniser->context.match_entity.length
+							+ 1);
+		} else {
+			parserutils_error error;
+			const uint8_t *cptr = NULL;
+
+			error = parserutils_inputstream_peek(
+					tokeniser->input,
+					tokeniser->context.pending,
+					&cptr,
+					&len);
+			if (error != PARSERUTILS_OK) {
+				return hubbub_error_from_parserutils_error(
+						error);
+			}
+
+			token.data.character.ptr = cptr;
+			token.data.character.len = len;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+			parserutils_inputstream_advance(tokeniser->input, len);
+		}
+
+		/* Reset for next time */
+		tokeniser->context.match_entity.complete = false;
+
+		tokeniser->state = STATE_DATA;
+	}
+
+	return HUBBUB_OK;
+}
+
+hubbub_error hubbub_tokeniser_handle_rcdata(hubbub_tokeniser *tokeniser)
+{
+	parserutils_error error;
+	hubbub_token token;
+	const uint8_t *cptr;
+	size_t len;
+
+	while ((error = parserutils_inputstream_peek(tokeniser->input,
+			tokeniser->context.pending, &cptr, &len)) ==
+					PARSERUTILS_OK) {
+		const uint8_t c = *cptr;
+
+		if (c == '&') {
+			tokeniser->state = STATE_CHARACTER_REFERENCE_RCDATA;
+			/* Don't eat the '&'; it'll be handled by entity
+			 * consumption */
+			break;
+		} else if (c == '<') {
+			if (tokeniser->context.pending > 0) {
+				/* Emit any pending characters */
+				emit_current_chars(tokeniser);
+			}
+
+			/* Buffer '<' */
+			tokeniser->context.pending = len;
+			tokeniser->state = STATE_RCDATA_LESSTHAN;
+			break;
+		} else if (c == '\r') {
+			error = parserutils_inputstream_peek(
+					tokeniser->input,
+					tokeniser->context.pending + len,
+					&cptr,
+					&len);
+
+			if (error != PARSERUTILS_OK && 
+					error != PARSERUTILS_EOF) {
+				break;
+			}
+
+			if (tokeniser->context.pending > 0) {
+				/* Emit any pending characters */
+				emit_current_chars(tokeniser);
+			}
+
+			if (error == PARSERUTILS_EOF ||	*cptr != '\n') {
+				/* Emit newline */
+				emit_character_token(tokeniser, &lf_str);
+			}
+
+			/* Advance over */
+			parserutils_inputstream_advance(tokeniser->input, 1);
+		} else {
+			if (c == '\0') {
+				/** \todo parse error */
+				error = parserutils_buffer_append(tokeniser->buffer,
+				u_fffd, sizeof(u_fffd));
+				if (error != PARSERUTILS_OK)
+					return hubbub_error_from_parserutils_error(error);
+			}
+
+			/* Just collect into buffer */
+			tokeniser->context.pending += len;
+		}
+	}
+	if (tokeniser->state != STATE_RCDATA_LESSTHAN &&
+		(tokeniser->state != STATE_RCDATA || error == PARSERUTILS_EOF) &&
+			tokeniser->context.pending > 0) {
+		/* Emit any pending characters */
+		emit_current_chars(tokeniser);
+	}
+
+	if (error == PARSERUTILS_EOF) {
+		token.type = HUBBUB_TOKEN_EOF;
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+	}
+
+	if (error == PARSERUTILS_EOF) {
+		return HUBBUB_NEEDDATA;
+	} else {
+		return hubbub_error_from_parserutils_error(error);
+	}
+}
+
+
+
+/* emit any pending tokens before calling */
+hubbub_error hubbub_tokeniser_handle_character_reference_rcdata(
+		hubbub_tokeniser *tokeniser)
+{
+	assert(tokeniser->context.pending == 0);
+
+	if (tokeniser->context.match_entity.complete == false) {
+		return hubbub_tokeniser_consume_character_reference(tokeniser,
+				tokeniser->context.pending);
+	} else {
+		hubbub_token token;
+
+		uint8_t utf8[6];
+		uint8_t *utf8ptr = utf8;
+		size_t len = sizeof(utf8);
+
+		token.type = HUBBUB_TOKEN_CHARACTER;
+
+		if (tokeniser->context.match_entity.codepoint) {
+			parserutils_charset_utf8_from_ucs4(
+				tokeniser->context.match_entity.codepoint,
+				&utf8ptr, &len);
+
+			token.data.character.ptr = utf8;
+			token.data.character.len = sizeof(utf8) - len;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			/* +1 for ampersand */
+			parserutils_inputstream_advance(tokeniser->input,
+					tokeniser->context.match_entity.length
+							+ 1);
+		} else {
+			parserutils_error error;
+			const uint8_t *cptr = NULL;
+
+			error = parserutils_inputstream_peek(
+					tokeniser->input,
+					tokeniser->context.pending,
+					&cptr,
+					&len);
+			if (error != PARSERUTILS_OK) {
+				return hubbub_error_from_parserutils_error(
+						error);
+			}
+
+			token.data.character.ptr = cptr;
+			token.data.character.len = len;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+			parserutils_inputstream_advance(tokeniser->input, len);
+		}
+
+		/* Reset for next time */
+		tokeniser->context.match_entity.complete = false;
+
+		tokeniser->state = STATE_RCDATA;
+	}
+
+	return HUBBUB_OK;
+}
+
+hubbub_error hubbub_tokeniser_handle_rawtext(hubbub_tokeniser *tokeniser)
+{
+	parserutils_error error;
+	hubbub_token token;
+	const uint8_t *cptr;
+	size_t len;
+
+	while ((error = parserutils_inputstream_peek(tokeniser->input,
+			tokeniser->context.pending, &cptr, &len)) ==
+					PARSERUTILS_OK) {
+		const uint8_t c = *cptr;
+
+		if (c == '<') {
+			if (tokeniser->context.pending > 0) {
+				/* Emit any pending characters */
+				emit_current_chars(tokeniser);
+			}
+
+			/* Buffer '<' */
+			tokeniser->context.pending = len;
+			tokeniser->state = STATE_RAWTEXT_LESSTHAN;
+			break;
+		} else if (c == '\r') {
+			error = parserutils_inputstream_peek(
+					tokeniser->input,
+					tokeniser->context.pending + len,
+					&cptr,
+					&len);
+
+			if (error != PARSERUTILS_OK && 
+					error != PARSERUTILS_EOF) {
+				break;
+			}
+
+			if (tokeniser->context.pending > 0) {
+				/* Emit any pending characters */
+				emit_current_chars(tokeniser);
+			}
+
+			if (error == PARSERUTILS_EOF ||	*cptr != '\n') {
+				/* Emit newline */
+				emit_character_token(tokeniser, &lf_str);
+			}
+
+			/* Advance over */
+			parserutils_inputstream_advance(tokeniser->input, 1);
+		} else {
+			if (c == '\0') {
+				/** \todo parse error */
+				error = parserutils_buffer_append(tokeniser->buffer,
+				u_fffd, sizeof(u_fffd));
+				if (error != PARSERUTILS_OK)
+					return hubbub_error_from_parserutils_error(error);
+			}
+
+			/* Just collect into buffer */
+			tokeniser->context.pending += len;
+		}
+	}
+	if (tokeniser->state != STATE_RAWTEXT_LESSTHAN &&
+		(tokeniser->state != STATE_RAWTEXT || error == PARSERUTILS_EOF) &&
+			tokeniser->context.pending > 0) {
+		/* Emit any pending characters */
+		emit_current_chars(tokeniser);
+	}
+
+	if (error == PARSERUTILS_EOF) {
+		token.type = HUBBUB_TOKEN_EOF;
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+	}
+
+	if (error == PARSERUTILS_EOF) {
+		return HUBBUB_NEEDDATA;
+	} else {
+		return hubbub_error_from_parserutils_error(error);
+	}
+}
+
+hubbub_error hubbub_tokeniser_handle_script_data(hubbub_tokeniser *tokeniser)
+{
+	parserutils_error error;
+	hubbub_token token;
+	const uint8_t *cptr;
+	size_t len;
+
+	while ((error = parserutils_inputstream_peek(tokeniser->input,
+			tokeniser->context.pending, &cptr, &len)) ==
+					PARSERUTILS_OK) {
+		const uint8_t c = *cptr;
+
+		if (c == '<') {
+			if (tokeniser->context.pending > 0) {
+				/* Emit any pending characters */
+				emit_current_chars(tokeniser);
+			}
+
+			/* Buffer '<' */
+			tokeniser->context.pending = len;
+			/** \todo tokeniser->state = STATE_SCRIPT_DATA_LESSTHAN; (script data states not yet implemented) */
+			break;
+		} else if (c == '\r') {
+			error = parserutils_inputstream_peek(
+					tokeniser->input,
+					tokeniser->context.pending + len,
+					&cptr,
+					&len);
+
+			if (error != PARSERUTILS_OK && 
+					error != PARSERUTILS_EOF) {
+				break;
+			}
+
+			if (tokeniser->context.pending > 0) {
+				/* Emit any pending characters */
+				emit_current_chars(tokeniser);
+			}
+
+			if (error == PARSERUTILS_EOF ||	*cptr != '\n') {
+				/* Emit newline */
+				emit_character_token(tokeniser, &lf_str);
+			}
+
+			/* Advance over */
+			parserutils_inputstream_advance(tokeniser->input, 1);
+		} else {
+			if (c == '\0') {
+				/** \todo parse error */
+				error = parserutils_buffer_append(tokeniser->buffer,
+				u_fffd, sizeof(u_fffd));
+				if (error != PARSERUTILS_OK)
+					return hubbub_error_from_parserutils_error(error);
+			}
+
+			/* Just collect into buffer */
+			tokeniser->context.pending += len;
+		}
+	}
+	if (tokeniser->state != STATE_SCRIPT_DATA &&
+		(/*tokeniser->state != STATE_SCRIPT_DATA_LESSTHAN || */error == PARSERUTILS_EOF) &&
+			tokeniser->context.pending > 0) {
+		/* Emit any pending characters */
+		emit_current_chars(tokeniser);
+	}
+
+	if (error == PARSERUTILS_EOF) {
+		token.type = HUBBUB_TOKEN_EOF;
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+	}
+
+	if (error == PARSERUTILS_EOF) {
+		return HUBBUB_NEEDDATA;
+	} else {
+		return hubbub_error_from_parserutils_error(error);
+	}
+}
+
+hubbub_error hubbub_tokeniser_handle_plaintext(hubbub_tokeniser *tokeniser)
+{
+	parserutils_error error;
+	hubbub_token token;
+	const uint8_t *cptr;
+	size_t len;
+
+	while ((error = parserutils_inputstream_peek(tokeniser->input,
+			tokeniser->context.pending, &cptr, &len)) ==
+					PARSERUTILS_OK) {
+		const uint8_t c = *cptr;
+
+		if (c == '\r') {
+			error = parserutils_inputstream_peek(
+					tokeniser->input,
+					tokeniser->context.pending + len,
+					&cptr,
+					&len);
+
+			if (error != PARSERUTILS_OK && 
+					error != PARSERUTILS_EOF) {
+				break;
+			}
+
+			if (tokeniser->context.pending > 0) {
+				/* Emit any pending characters */
+				emit_current_chars(tokeniser);
+			}
+
+			if (error == PARSERUTILS_EOF ||	*cptr != '\n') {
+				/* Emit newline */
+				emit_character_token(tokeniser, &lf_str);
+			}
+
+			/* Advance over */
+			parserutils_inputstream_advance(tokeniser->input, 1);
+		} else {
+			if (c == '\0') {
+				/** \todo parse error */
+				error = parserutils_buffer_append(tokeniser->buffer,
+				u_fffd, sizeof(u_fffd));
+				if (error != PARSERUTILS_OK)
+					return hubbub_error_from_parserutils_error(error);
+			}
+
+			/* Just collect into buffer */
+			tokeniser->context.pending += len;
+		}
+	}
+	if (tokeniser->context.pending > 0) {
+		/* Emit any pending characters */
+		emit_current_chars(tokeniser);
+	}
+
+	if (error == PARSERUTILS_EOF) {
+		token.type = HUBBUB_TOKEN_EOF;
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+	}
+
+	if (error == PARSERUTILS_EOF) {
+		return HUBBUB_NEEDDATA;
+	} else {
+		return hubbub_error_from_parserutils_error(error);
+	}
+}
+
+/* this state always switches to another state straight away */
+/* this state expects the current character to be '<' */
+hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+
+	size_t len;
+	const uint8_t *cptr;
+	parserutils_error error;
+	uint8_t c;
+
+	assert(tokeniser->context.pending == 1);
+/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
+
+	error = parserutils_inputstream_peek(tokeniser->input, 
+			tokeniser->context.pending, &cptr, &len);
+
+	if (error != PARSERUTILS_OK) {
+		if (error == PARSERUTILS_EOF) {
+			/** \todo parse error */
+			/* Emit single '<' char  */
+			emit_current_chars(tokeniser);
+			tokeniser->state = STATE_DATA;
+			return HUBBUB_OK;
+		} else {
+			return hubbub_error_from_parserutils_error(error);
+		}
+	}
+
+	c = *cptr;
+
+	if (c == '!') {
+		parserutils_inputstream_advance(tokeniser->input, SLEN("<!"));
+
+		tokeniser->context.pending = 0;
+		tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
+	} else if (c == '/'){
+		tokeniser->context.pending += len;
+
+		tokeniser->context.close_tag_match.match = false;
+		tokeniser->context.close_tag_match.count = 0;
+
+		tokeniser->state = STATE_CLOSE_TAG_OPEN;
+	} else if ('A' <= c && c <= 'Z') {
+		uint8_t lc = (c + 0x20);
+
+		START_BUF(ctag->name, &lc, len);
+		ctag->n_attributes = 0;
+		tokeniser->context.current_tag_type =
+				HUBBUB_TOKEN_START_TAG;
+
+		tokeniser->context.pending += len;
+
+		tokeniser->state = STATE_TAG_NAME;
+	} else if ('a' <= c && c <= 'z') {
+		START_BUF(ctag->name, cptr, len);
+		ctag->n_attributes = 0;
+		tokeniser->context.current_tag_type =
+				HUBBUB_TOKEN_START_TAG;
+
+		tokeniser->context.pending += len;
+
+		tokeniser->state = STATE_TAG_NAME;
+	} else if (c == '?'){
+		/** \todo parse error */
+		/* Cursor still at "<", need to advance past it */
+			parserutils_inputstream_advance(
+					tokeniser->input, SLEN("<"));
+			tokeniser->context.pending = 0;
+
+			tokeniser->state = STATE_BOGUS_COMMENT;
+	} else {
+		/** \todo parse error */
+		/* Emit single '<' char  */
+		emit_current_chars(tokeniser);
+		tokeniser->state = STATE_DATA;
+	}
+
+
+	return HUBBUB_OK;
+}
+
+/* this state expects tokeniser->context.chars to be "</" */
+/* this state never stays in this state for more than one character */
+hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
+{
+	size_t len;
+	const uint8_t *cptr;
+	parserutils_error error;
+	uint8_t c;
+
+	assert(tokeniser->context.pending == 2);
+/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
+/*	assert(tokeniser->context.chars.ptr[1] == '/'); */
+
+	error = parserutils_inputstream_peek(tokeniser->input, 
+			tokeniser->context.pending, &cptr, &len);
+
+	if (error != PARSERUTILS_OK) {
+		if (error == PARSERUTILS_EOF) {
+			/** \todo parse error */
+			/* Emit '</' chars  */
+			emit_current_chars(tokeniser);
+			tokeniser->state = STATE_DATA;
+			return HUBBUB_OK;
+		} else {
+			return hubbub_error_from_parserutils_error(error);
+		}
+	}
+
+	c = *cptr;
+
+	if ('A' <= c && c <= 'Z') {
+		uint8_t lc = (c + 0x20);
+		START_BUF(tokeniser->context.current_tag.name,
+				&lc, len);
+		tokeniser->context.current_tag.n_attributes = 0;
+
+		tokeniser->context.current_tag_type = HUBBUB_TOKEN_END_TAG;
+
+		tokeniser->context.pending += len;
+
+		tokeniser->state = STATE_TAG_NAME;
+	} else if ('a' <= c && c <= 'z') {
+		START_BUF(tokeniser->context.current_tag.name,
+				cptr, len);
+		tokeniser->context.current_tag.n_attributes = 0;
+
+		tokeniser->context.current_tag_type = HUBBUB_TOKEN_END_TAG;
+
+		tokeniser->context.pending += len;
+
+		tokeniser->state = STATE_TAG_NAME;
+	} else if (c == '>') {
+		/** \todo parse error */
+
+		/* Cursor still at "</", need to collect ">" */
+		tokeniser->context.pending += len;
+
+		/* Now need to advance past "</>" */
+		parserutils_inputstream_advance(tokeniser->input,
+				tokeniser->context.pending);
+		tokeniser->context.pending = 0;
+
+		tokeniser->state = STATE_DATA;
+	} else {
+		/** \todo parse error */
+
+		/* Cursor still at "</", need to advance past it */
+		parserutils_inputstream_advance(tokeniser->input,
+				tokeniser->context.pending);
+		tokeniser->context.pending = 0;
+
+		tokeniser->state = STATE_BOGUS_COMMENT;
+	}
+
+	return HUBBUB_OK;
+}
+
+/* this state expects tokeniser->context.current_tag to already have its
+   first character set */
+hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+
+	size_t len;
+	const uint8_t *cptr;
+	parserutils_error error;
+	uint8_t c;
+
+	assert(tokeniser->context.pending > 0);
+/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
+	assert(ctag->name.len > 0);
+/*	assert(ctag->name.ptr); */
+
+	error = parserutils_inputstream_peek(tokeniser->input, 
+			tokeniser->context.pending, &cptr, &len);
+
+	if (error != PARSERUTILS_OK) {
+		if (error == PARSERUTILS_EOF) {
+			/** \todo parse error */
+			tokeniser->state = STATE_DATA;
+
+			/* Skip all pending characters */
+			parserutils_inputstream_advance(
+					tokeniser->input, tokeniser->context.pending);
+			return HUBBUB_OK;
+		} else {
+			return hubbub_error_from_parserutils_error(error);
+		}
+	}
+
+	c = *cptr;
+
+	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+		tokeniser->context.pending += len;
+		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
+	} else if (c == '/') {
+		tokeniser->context.pending += len;
+		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
+	} else if (c == '>') {
+		tokeniser->context.pending += len;
+		tokeniser->state = STATE_DATA;
+		return emit_current_tag(tokeniser);
+	} else if ('A' <= c && c <= 'Z') {
+		uint8_t lc = (c + 0x20);
+		COLLECT(ctag->name, &lc, len);
+		tokeniser->context.pending += len;
+	} else if (c == '\0') {
+		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
+		tokeniser->context.pending += len;
+	} else {
+		COLLECT(ctag->name, cptr, len);
+		tokeniser->context.pending += len;
+	}
+
+	return HUBBUB_OK;
+}
+
+/* this state always switches to another state straight away */
+/* this state expects the current character to be '<' */
+hubbub_error hubbub_tokeniser_handle_rcdata_lessthan(hubbub_tokeniser *tokeniser)
+{
+	//hubbub_tag *ctag = &tokeniser->context.current_tag;
+
+	size_t len;
+	const uint8_t *cptr;
+	parserutils_error error;
+	uint8_t c;
+
+	assert(tokeniser->context.pending == 1);
+/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
+
+	error = parserutils_inputstream_peek(tokeniser->input, 
+			tokeniser->context.pending, &cptr, &len);
+
+	if (error != PARSERUTILS_OK) {
+		if (error == PARSERUTILS_EOF) {
+			/** \todo parse error */
+			/* Emit single '<' char  */
+			emit_current_chars(tokeniser);
+			tokeniser->state = STATE_RCDATA;
+			return HUBBUB_OK;
+		} else {
+			return hubbub_error_from_parserutils_error(error);
+		}
+	}
+
+	c = *cptr;
+
+	if (c == '/'){
+		tokeniser->context.pending += len;
+
+		tokeniser->context.close_tag_match.match = false;
+		tokeniser->context.close_tag_match.count = 0;
+
+		tokeniser->state = STATE_RCDATA_CLOSE_TAG_OPEN;
+	} else {
+		/* Emit single '<' char  */
+		emit_current_chars(tokeniser);
+		tokeniser->state = STATE_RCDATA;
+	}
+
+
+	return HUBBUB_OK;
+}
+
+/* this state expects tokeniser->context.chars to be "</" */
+/* this state never stays in this state for more than one character */
+hubbub_error hubbub_tokeniser_handle_rcdata_close_tag_open(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+
+	size_t len;
+	const uint8_t *cptr;
+	parserutils_error error;
+	uint8_t c;
+
+	assert(tokeniser->context.pending == 2);
+/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
+/*	assert(tokeniser->context.chars.ptr[1] == '/'); */
+
+	uint8_t *start_tag_name =
+			tokeniser->context.last_start_tag_name;
+	size_t start_tag_len =
+		tokeniser->context.last_start_tag_len;
 
-			if (error == PARSERUTILS_EOF ||	*cptr != '\n') {
-				/* Emit newline */
-				emit_character_token(tokeniser, &lf_str);
-			}
+	while ((error = parserutils_inputstream_peek(tokeniser->input,
+				ctx->pending +
+					ctx->close_tag_match.count,
+				&cptr,
+				&len)) == PARSERUTILS_OK) {
+		c = *cptr;
 
-			/* Advance over */
-			parserutils_inputstream_advance(tokeniser->input, 1);
-		} else {
-			if (c == '\0') {
-				/** \todo parse error */
-			}
+		if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
+				!= (c & ~0x20)) {
+			break;
+		}
 
-			/* Just collect into buffer */
-			tokeniser->context.pending += len;
+		ctx->close_tag_match.count += len;
+
+		if (ctx->close_tag_match.count == start_tag_len) {
+
+			/* Set the flag to be used in the close tag name state */
+			ctx->close_tag_match.match = true;
+			break;
 		}
 	}
-	if (tokeniser->state != STATE_TAG_OPEN &&
-		(tokeniser->state != STATE_DATA || error == PARSERUTILS_EOF) &&
-			tokeniser->context.pending > 0) {
-		/* Emit any pending characters */
-		emit_current_chars(tokeniser);
-	}
 
-	if (error == PARSERUTILS_EOF) {
-		token.type = HUBBUB_TOKEN_EOF;
-		hubbub_tokeniser_emit_token(tokeniser, &token);
+	if (error != PARSERUTILS_OK) {
+		if (error == PARSERUTILS_EOF) {
+			tokeniser->state = STATE_RCDATA;
+			tokeniser->context.pending += ctx->close_tag_match.count;
+			return HUBBUB_OK;
+		} else {
+			return hubbub_error_from_parserutils_error(error);
+		}
 	}
 
-	if (error == PARSERUTILS_EOF) {
-		return HUBBUB_NEEDDATA;
-	} else {
-		return hubbub_error_from_parserutils_error(error);
+	if (ctx->close_tag_match.match == true) {
+		error = parserutils_inputstream_peek(
+				 		tokeniser->input,
+				 		ctx->pending +
+					 		ctx->close_tag_match.count,
+						&cptr,
+				 		&len);
+
+		if (error != PARSERUTILS_OK && 
+				error != PARSERUTILS_EOF) {
+			return hubbub_error_from_parserutils_error(
+					error);
+		} else if (error != PARSERUTILS_EOF) {
+			c = *cptr;
+
+			if (c != '\t' && c != '\n' && c != '\f' && c != '\r' && 
+					c != ' ' && c != '>' &&
+					c != '/') {
+				ctx->close_tag_match.match = false;
+			}
+		}
 	}
-}
 
-/* emit any pending tokens before calling */
-hubbub_error hubbub_tokeniser_handle_character_reference_data(
-		hubbub_tokeniser *tokeniser)
-{
-	assert(tokeniser->context.pending == 0);
+	if (ctx->close_tag_match.match == true) {
 
-	if (tokeniser->context.match_entity.complete == false) {
-		return hubbub_tokeniser_consume_character_reference(tokeniser,
-				tokeniser->context.pending);
-	} else {
-		hubbub_token token;
+		tokeniser->state = STATE_RCDATA_CLOSE_TAG_NAME;
 
-		uint8_t utf8[6];
-		uint8_t *utf8ptr = utf8;
-		size_t len = sizeof(utf8);
+		/* Create a new buffer and set the first character of the tag name */
+		START_BUF(ctx->current_tag.name,
+					&start_tag_name[0], len);
+		tokeniser->context.current_tag.n_attributes = 0;
 
-		token.type = HUBBUB_TOKEN_CHARACTER;
+		tokeniser->context.current_tag_type =
+				HUBBUB_TOKEN_END_TAG;
 
-		if (tokeniser->context.match_entity.codepoint) {
-			parserutils_charset_utf8_from_ucs4(
-				tokeniser->context.match_entity.codepoint,
-				&utf8ptr, &len);
+		tokeniser->context.pending += len;
 
-			token.data.character.ptr = utf8;
-			token.data.character.len = sizeof(utf8) - len;
+		tokeniser->state = STATE_RCDATA_CLOSE_TAG_NAME;
+	} else {
+		emit_current_chars(tokeniser);
+		tokeniser->state = STATE_RCDATA;
+	}
 
-			hubbub_tokeniser_emit_token(tokeniser, &token);
+	return HUBBUB_OK;
+}
 
-			/* +1 for ampersand */
-			parserutils_inputstream_advance(tokeniser->input,
-					tokeniser->context.match_entity.length
-							+ 1);
-		} else {
-			parserutils_error error;
-			const uint8_t *cptr = NULL;
+/* this state expects tokeniser->context.current_tag to already have its
+   first character set */
+hubbub_error hubbub_tokeniser_handle_rcdata_close_tag_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
 
-			error = parserutils_inputstream_peek(
-					tokeniser->input,
-					tokeniser->context.pending,
-					&cptr,
-					&len);
-			if (error != PARSERUTILS_OK) {
-				return hubbub_error_from_parserutils_error(
-						error);
-			}
+	size_t len;
+	const uint8_t *cptr;
+	parserutils_error error;
+	uint8_t c;
 
-			token.data.character.ptr = cptr;
-			token.data.character.len = len;
+	assert(tokeniser->context.pending > 0);
+/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
+	assert(ctx->current_tag.name.len > 0);
+/*	assert(ctx->current_tag.name.ptr); */
 
-			hubbub_tokeniser_emit_token(tokeniser, &token);
-			parserutils_inputstream_advance(tokeniser->input, len);
+	error = parserutils_inputstream_peek(tokeniser->input, 
+			tokeniser->context.pending, &cptr, &len);
+
+	if (error != PARSERUTILS_OK) {
+		if (error == PARSERUTILS_EOF) {
+			/** \todo parse error */
+			tokeniser->state = STATE_RCDATA;
+			return HUBBUB_OK;
+		} else {
+			return hubbub_error_from_parserutils_error(error);
 		}
+	}
 
-		/* Reset for next time */
-		tokeniser->context.match_entity.complete = false;
+	c = *cptr;
 
+	if ((c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r')
+		&& ctx->close_tag_match.match == true) {
+		/** \todo add condition for appropriate end tag token */
+		tokeniser->context.pending += len;
+		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
+	} else if (c == '/' && ctx->close_tag_match.match == true) {
+		/** \todo add condition for appropriate end tag token */
+		tokeniser->context.pending += len;
+		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
+	} else if (c == '>' && ctx->close_tag_match.match == true) {
+		/** \todo add condition for appropriate end tag token */
+		tokeniser->context.pending += len;
 		tokeniser->state = STATE_DATA;
+		return emit_current_tag(tokeniser);
+	} else if ('A' <= c && c <= 'Z') {
+		uint8_t lc = (c + 0x20);
+		COLLECT(ctx->current_tag.name, &lc, len);
+		tokeniser->context.pending += len;
+	} else if ('a' <= c && c <= 'z') {
+		COLLECT(ctx->current_tag.name, cptr, len);
+		tokeniser->context.pending += len;
+	} else {
+		tokeniser->state = STATE_RCDATA;
+		return emit_current_chars(tokeniser);
 	}
 
 	return HUBBUB_OK;
@@ -807,9 +1686,9 @@ hubbub_error hubbub_tokeniser_handle_character_reference_data(
 
 /* this state always switches to another state straight away */
 /* this state expects the current character to be '<' */
-hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
+hubbub_error hubbub_tokeniser_handle_rawtext_lessthan(hubbub_tokeniser *tokeniser)
 {
-	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	//hubbub_tag *ctag = &tokeniser->context.current_tag;
 
 	size_t len;
 	const uint8_t *cptr;
@@ -827,7 +1706,7 @@ hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
 			/** \todo parse error */
 			/* Emit single '<' char  */
 			emit_current_chars(tokeniser);
-			tokeniser->state = STATE_DATA;
+			tokeniser->state = STATE_RAWTEXT;
 			return HUBBUB_OK;
 		} else {
 			return hubbub_error_from_parserutils_error(error);
@@ -836,51 +1715,17 @@ hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
 
 	c = *cptr;
 
-	if (c == '!') {
-		parserutils_inputstream_advance(tokeniser->input, SLEN("<!"));
-
-		tokeniser->context.pending = 0;
-		tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
-	} else if (c == '/'){
+	if (c == '/'){
 		tokeniser->context.pending += len;
 
 		tokeniser->context.close_tag_match.match = false;
 		tokeniser->context.close_tag_match.count = 0;
 
-		tokeniser->state = STATE_CLOSE_TAG_OPEN;
-	} else if ('A' <= c && c <= 'Z') {
-		uint8_t lc = (c + 0x20);
-
-		START_BUF(ctag->name, &lc, len);
-		ctag->n_attributes = 0;
-		tokeniser->context.current_tag_type =
-				HUBBUB_TOKEN_START_TAG;
-
-		tokeniser->context.pending += len;
-
-		tokeniser->state = STATE_TAG_NAME;
-	} else if ('a' <= c && c <= 'z') {
-		START_BUF(ctag->name, cptr, len);
-		ctag->n_attributes = 0;
-		tokeniser->context.current_tag_type =
-				HUBBUB_TOKEN_START_TAG;
-
-		tokeniser->context.pending += len;
-
-		tokeniser->state = STATE_TAG_NAME;
-	} else if (c == '?'){
-		/** \todo parse error */
-		/* Cursor still at "<", need to advance past it */
-			parserutils_inputstream_advance(
-					tokeniser->input, SLEN("<"));
-			tokeniser->context.pending = 0;
-
-			tokeniser->state = STATE_BOGUS_COMMENT;
+		tokeniser->state = STATE_RAWTEXT_CLOSE_TAG_OPEN;
 	} else {
-		/** \todo parse error */
 		/* Emit single '<' char  */
 		emit_current_chars(tokeniser);
-		tokeniser->state = STATE_DATA;
+		tokeniser->state = STATE_RCDATA;
 	}
 
 
@@ -889,8 +1734,10 @@ hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
 
 /* this state expects tokeniser->context.chars to be "</" */
 /* this state never stays in this state for more than one character */
-hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
+hubbub_error hubbub_tokeniser_handle_rawtext_close_tag_open(hubbub_tokeniser *tokeniser)
 {
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+
 	size_t len;
 	const uint8_t *cptr;
 	parserutils_error error;
@@ -900,65 +1747,84 @@ hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
 /*	assert(tokeniser->context.chars.ptr[0] == '<'); */
 /*	assert(tokeniser->context.chars.ptr[1] == '/'); */
 
-	error = parserutils_inputstream_peek(tokeniser->input, 
-			tokeniser->context.pending, &cptr, &len);
+	uint8_t *start_tag_name =
+			tokeniser->context.last_start_tag_name;
+	size_t start_tag_len =
+		tokeniser->context.last_start_tag_len;
+
+	while ((error = parserutils_inputstream_peek(tokeniser->input,
+				ctx->pending +
+					ctx->close_tag_match.count,
+				&cptr,
+				&len)) == PARSERUTILS_OK) {
+		c = *cptr;
+
+		if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
+				!= (c & ~0x20)) {
+			break;
+		}
+
+		ctx->close_tag_match.count += len;
+
+		if (ctx->close_tag_match.count == start_tag_len) {
+
+			// Sets the flag to be used in name state.
+			ctx->close_tag_match.match = true;
+			break;
+		}
+	}
 
 	if (error != PARSERUTILS_OK) {
 		if (error == PARSERUTILS_EOF) {
-			/** \todo parse error */
-			/* Emit '</' chars  */
-			emit_current_chars(tokeniser);
-			tokeniser->state = STATE_DATA;
+			tokeniser->state = STATE_RAWTEXT;
+			tokeniser->context.pending += ctx->close_tag_match.count;
 			return HUBBUB_OK;
 		} else {
 			return hubbub_error_from_parserutils_error(error);
 		}
 	}
 
-	c = *cptr;
+	if (ctx->close_tag_match.match == true) {
+		error = parserutils_inputstream_peek(
+				 		tokeniser->input,
+				 		ctx->pending +
+					 		ctx->close_tag_match.count,
+						&cptr,
+				 		&len);
+
+		if (error != PARSERUTILS_OK && 
+				error != PARSERUTILS_EOF) {
+			return hubbub_error_from_parserutils_error(
+					error);
+		} else if (error != PARSERUTILS_EOF) {
+			c = *cptr;
 
-	if ('A' <= c && c <= 'Z') {
-		uint8_t lc = (c + 0x20);
-		START_BUF(tokeniser->context.current_tag.name,
-				&lc, len);
-		tokeniser->context.current_tag.n_attributes = 0;
+			if (c != '\t' && c != '\n' && c != '\f' && c != '\r' && 
+					c != ' ' && c != '>' &&
+					c != '/') {
+				ctx->close_tag_match.match = false;
+			}
+		}
+	}
 
-		tokeniser->context.current_tag_type = HUBBUB_TOKEN_END_TAG;
+	if (ctx->close_tag_match.match == true) {
 
-		tokeniser->context.pending += len;
+		tokeniser->state = STATE_RCDATA_CLOSE_TAG_NAME;
 
-		tokeniser->state = STATE_TAG_NAME;
-	} else if ('a' <= c && c <= 'z') {
-		START_BUF(tokeniser->context.current_tag.name,
-				cptr, len);
+		// Creates a new buffer and sets first character of the tag name
+		START_BUF(ctx->current_tag.name,
+					&start_tag_name[0], len);
 		tokeniser->context.current_tag.n_attributes = 0;
 
-		tokeniser->context.current_tag_type = HUBBUB_TOKEN_END_TAG;
-
-		tokeniser->context.pending += len;
-
-		tokeniser->state = STATE_TAG_NAME;
-	} else if (c == '>') {
-		/** \todo parse error */
+		tokeniser->context.current_tag_type =
+				HUBBUB_TOKEN_END_TAG;
 
-		/* Cursor still at "</", need to collect ">" */
 		tokeniser->context.pending += len;
 
-		/* Now need to advance past "</>" */
-		parserutils_inputstream_advance(tokeniser->input,
-				tokeniser->context.pending);
-		tokeniser->context.pending = 0;
-
-		tokeniser->state = STATE_DATA;
+		tokeniser->state = STATE_RAWTEXT_CLOSE_TAG_NAME;
 	} else {
-		/** \todo parse error */
-
-		/* Cursor still at "</", need to advance past it */
-		parserutils_inputstream_advance(tokeniser->input,
-				tokeniser->context.pending);
-		tokeniser->context.pending = 0;
-
-		tokeniser->state = STATE_BOGUS_COMMENT;
+		emit_current_chars(tokeniser);
+		tokeniser->state = STATE_RAWTEXT;
 	}
 
 	return HUBBUB_OK;
@@ -966,9 +1832,9 @@ hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
 
 /* this state expects tokeniser->context.current_tag to already have its
    first character set */
-hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
+hubbub_error hubbub_tokeniser_handle_rawtext_close_tag_name(hubbub_tokeniser *tokeniser)
 {
-	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
 
 	size_t len;
 	const uint8_t *cptr;
@@ -985,12 +1851,9 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
 
 	if (error != PARSERUTILS_OK) {
 		if (error == PARSERUTILS_EOF) {
-			/** \todo parse error */
-			tokeniser->state = STATE_DATA;
+			tokeniser->state = STATE_RAWTEXT;
 
-			// skips all pending charachters
-			parserutils_inputstream_advance(
-					tokeniser->input, tokeniser->context.pending);
+			emit_current_chars(tokeniser);
 			return HUBBUB_OK;
 		} else {
 			return hubbub_error_from_parserutils_error(error);
@@ -999,26 +1862,30 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
 
 	c = *cptr;
 
-	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+	if ((c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r')
+		&& ctx->close_tag_match.match == true) {
+		// Add condition for appropriate end tag token
 		tokeniser->context.pending += len;
 		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
-	} else if (c == '/') {
+	} else if (c == '/' && ctx->close_tag_match.match == true) {
+		// Add condition for appropriate end tag token
 		tokeniser->context.pending += len;
 		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
-	} else if (c == '>') {
+	} else if (c == '>' && ctx->close_tag_match.match == true) {
+		// Add condition for appropriate end tag token
 		tokeniser->context.pending += len;
 		tokeniser->state = STATE_DATA;
 		return emit_current_tag(tokeniser);
 	} else if ('A' <= c && c <= 'Z') {
 		uint8_t lc = (c + 0x20);
-		COLLECT(ctag->name, &lc, len);
+		COLLECT(ctx->current_tag.name, &lc, len);
 		tokeniser->context.pending += len;
-	} else if (c == '\0') {
-		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
+	} else if ('a' <= c && c <= 'z') {
+		COLLECT(ctx->current_tag.name, cptr, len);
 		tokeniser->context.pending += len;
 	} else {
-		COLLECT(ctag->name, cptr, len);
-		tokeniser->context.pending += len;
+		tokeniser->state = STATE_RAWTEXT;
+		return emit_current_chars(tokeniser);
 	}
 
 	return HUBBUB_OK;
@@ -3315,7 +4182,7 @@ hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
 	err = hubbub_tokeniser_emit_token(tokeniser, &token);
 
 	if (token.type == HUBBUB_TOKEN_START_TAG) {
-		/* Save start tag name for R?CDATA */
+		/* Save start tag name for R?CDATA states */
 		if (token.data.tag.name.len <
 			sizeof(tokeniser->context.last_start_tag_name)) {
 			strncpy((char *) tokeniser->context.last_start_tag_name,
@@ -3328,8 +4195,9 @@ hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
 			tokeniser->context.last_start_tag_len = 0;
 		}
 	} else /* if (token->type == HUBBUB_TOKEN_END_TAG) */ {
-		/* Reset content model after R?CDATA elements */
-		tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
+		/* Reset content model (i.e. the state will now be STATE_DATA)
+			 after R?CDATA elements */
+		tokeniser->state = STATE_DATA;
 	}
 
 	/* Reset the self-closing flag */
diff --git a/src/tokeniser/tokeniser.h b/src/tokeniser/tokeniser.h
index 5700923..cd8f662 100644
--- a/src/tokeniser/tokeniser.h
+++ b/src/tokeniser/tokeniser.h
@@ -25,7 +25,7 @@ typedef struct hubbub_tokeniser hubbub_tokeniser;
 typedef enum hubbub_tokeniser_opttype {
 	HUBBUB_TOKENISER_TOKEN_HANDLER,
 	HUBBUB_TOKENISER_ERROR_HANDLER,
-	HUBBUB_TOKENISER_CONTENT_MODEL,
+	HUBBUB_TOKENISER_INITIAL_STATE,
 	HUBBUB_TOKENISER_PROCESS_CDATA,
 	HUBBUB_TOKENISER_PAUSE
 } hubbub_tokeniser_opttype;
@@ -45,8 +45,8 @@ typedef union hubbub_tokeniser_optparams {
 	} error_handler;		/**< Error handling callback */
 
 	struct {
-		hubbub_content_model model;
-	} content_model;		/**< Current content model */
+		hubbub_initial_state state;
+	} initial_state;		/**< Initial State of the tokeniser */
 
 	bool process_cdata;		/**< Whether to process CDATA sections*/
 
diff --git a/src/treebuilder/in_body.c b/src/treebuilder/in_body.c
index 5157e66..d16a365 100644
--- a/src/treebuilder/in_body.c
+++ b/src/treebuilder/in_body.c
@@ -740,10 +740,10 @@ hubbub_error process_plaintext_in_body(hubbub_treebuilder *treebuilder,
 	if (err != HUBBUB_OK)
 		return err;
 
-	params.content_model.model = HUBBUB_CONTENT_MODEL_PLAINTEXT;
+	params.initial_state.state = HUBBUB_INITIAL_STATE_PLAINTEXT;
 
 	err = hubbub_tokeniser_setopt(treebuilder->tokeniser,
-			HUBBUB_TOKENISER_CONTENT_MODEL,
+			HUBBUB_TOKENISER_INITIAL_STATE,
 			&params);
 	assert(err == HUBBUB_OK);
 
diff --git a/src/treebuilder/treebuilder.c b/src/treebuilder/treebuilder.c
index a6a4b43..5784a83 100644
--- a/src/treebuilder/treebuilder.c
+++ b/src/treebuilder/treebuilder.c
@@ -473,10 +473,10 @@ hubbub_error parse_generic_rcdata(hubbub_treebuilder *treebuilder,
 	if (error != HUBBUB_OK)
 		return error;
 
-	params.content_model.model = rcdata ? HUBBUB_CONTENT_MODEL_RCDATA
-					    : HUBBUB_CONTENT_MODEL_CDATA;
+	params.initial_state.state = rcdata ? HUBBUB_INITIAL_STATE_RCDATA
+					    : HUBBUB_INITIAL_STATE_CDATA;
 	error = hubbub_tokeniser_setopt(treebuilder->tokeniser,
-				HUBBUB_TOKENISER_CONTENT_MODEL, &params);
+				HUBBUB_TOKENISER_INITIAL_STATE, &params);
 	/* There is no way that setopt can fail. Ensure this. */
 	assert(error == HUBBUB_OK);
 
diff --git a/test/data/tokeniser2/contentModelFlags.test b/test/data/tokeniser2/contentModelFlags.test
index 1dec3e8..a8b1695 100644
--- a/test/data/tokeniser2/contentModelFlags.test
+++ b/test/data/tokeniser2/contentModelFlags.test
@@ -1,73 +1,73 @@
 {"tests": [
 
 {"description":"PLAINTEXT content model flag",
-"contentModelFlags":["PLAINTEXT"],
+"initialStates":["PLAINTEXT state"],
 "lastStartTag":"plaintext",
 "input":"<head>&body;",
 "output":[["Character", "<head>&body;"]]},
 
-{"description":"End tag closing RCDATA or CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag closing RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp>",
 "output":[["Character", "foo"], ["EndTag", "xmp"]]},
 
-{"description":"End tag closing RCDATA or CDATA (case-insensitivity)",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag closing RCDATA or RAWTEXT (case-insensitivity)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xMp>",
 "output":[["Character", "foo"], ["EndTag", "xmp"]]},
 
-{"description":"End tag closing RCDATA or CDATA (ending with space)",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag closing RCDATA or RAWTEXT (ending with space)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp ",
-"output":[["Character", "foo"], "ParseError", ["EndTag", "xmp"]]},
+"output":[["Character", "foo"], "ParseError"]},
 
-{"description":"End tag closing RCDATA or CDATA (ending with EOF)",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag closing RCDATA or RAWTEXT (ending with EOF)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp",
-"output":[["Character", "foo"], "ParseError", ["EndTag", "xmp"]]},
+"output":[["Character", "foo</xmp"]]},
 
-{"description":"End tag closing RCDATA or CDATA (ending with slash)",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag closing RCDATA or RAWTEXT (ending with slash)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp/",
-"output":[["Character", "foo"], "ParseError", ["EndTag", "xmp"]]},
+"output":[["Character", "foo"], "ParseError"]},
 
-{"description":"End tag not closing RCDATA or CDATA (ending with left-angle-bracket)",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag not closing RCDATA or RAWTEXT (ending with left-angle-bracket)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp<",
 "output":[["Character", "foo</xmp<"]]},
 
-{"description":"End tag with incorrect name in RCDATA or CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag with incorrect name in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"</foo>bar</xmp>",
 "output":[["Character", "</foo>bar"], ["EndTag", "xmp"]]},
 
-{"description":"End tag with incorrect name in RCDATA or CDATA (starting like correct name)",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag with incorrect name in RCDATA or RAWTEXT (starting like correct name)",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"</foo>bar</xmpaar>",
 "output":[["Character", "</foo>bar</xmpaar>"]]},
 
-{"description":"End tag closing RCDATA or CDATA, switching back to PCDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag closing RCDATA or RAWTEXT, switching back to PCDATA",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp></baz>",
 "output":[["Character", "foo"], ["EndTag", "xmp"], ["EndTag", "baz"]]},
 
-{"description":"CDATA w/ something looking like an entity",
-"contentModelFlags":["CDATA"],
+{"description":"RAWTEXT w/ something looking like an entity",
+"initialStates":["RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"&foo;",
 "output":[["Character", "&foo;"]]},
 
 {"description":"RCDATA w/ an entity",
-"contentModelFlags":["RCDATA"],
+"initialStates":["RCDATA state"],
 "lastStartTag":"textarea",
 "input":"&lt;",
 "output":[["Character", "<"]]}
diff --git a/test/data/tokeniser2/escapeFlag.test b/test/data/tokeniser2/escapeFlag.test
index 4c4bf51..18cb430 100644
--- a/test/data/tokeniser2/escapeFlag.test
+++ b/test/data/tokeniser2/escapeFlag.test
@@ -1,33 +1,33 @@
 {"tests": [
 
-{"description":"Commented close tag in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Commented close tag in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!--</xmp>--></xmp>",
-"output":[["Character", "foo<!--</xmp>-->"], ["EndTag", "xmp"]]},
+"output":[["Character", "foo<!--"], ["EndTag", "xmp"], ["Character", "-->"], ["EndTag", "xmp"]]},
 
-{"description":"Bogus comment in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Bogus comment in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!-->baz</xmp>",
 "output":[["Character", "foo<!-->baz"], ["EndTag", "xmp"]]},
 
-{"description":"End tag surrounded by bogus comment in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"End tag surrounded by bogus comment in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!--></xmp><!-->baz</xmp>",
 "output":[["Character", "foo<!-->"], ["EndTag", "xmp"], "ParseError", ["Comment", ""], ["Character", "baz"], ["EndTag", "xmp"]]},
 
 {"description":"Commented entities in RCDATA",
-"contentModelFlags":["RCDATA"],
+"initialStates":["RCDATA state"],
 "lastStartTag":"xmp",
 "input":" &amp; <!-- &amp; --> &amp; </xmp>",
-"output":[["Character", " & <!-- &amp; --> & "], ["EndTag", "xmp"]]},
+"output":[["Character", " & <!-- & --> & "], ["EndTag", "xmp"]]},
 
-{"description":"Incorrect comment ending sequences in [R]CDATA",
-"contentModelFlags":["RCDATA", "CDATA"],
+{"description":"Incorrect comment ending sequences in RCDATA or RAWTEXT",
+"initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!-- x --x>x-- >x--!>x--<></xmp>",
-"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<></xmp>"]]}
+"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<>"], ["EndTag", "xmp"]]}
 
 ]}
diff --git a/test/tokeniser2.c b/test/tokeniser2.c
index c320f42..d9bc3c1 100644
--- a/test/tokeniser2.c
+++ b/test/tokeniser2.c
@@ -27,7 +27,7 @@ typedef struct context {
 	size_t char_off;
 
 	const char *last_start_tag;
-	struct array_list *content_model;
+	struct array_list *initial_state;
 	bool process_cdata;
 } context;
 
@@ -65,7 +65,7 @@ int main(int argc, char **argv)
 			(struct json_object *) array_list_get_idx(tests, i);
 
 		ctx.last_start_tag = NULL;
-		ctx.content_model = NULL;
+		ctx.initial_state = NULL;
 		ctx.process_cdata = false;
 
 		/* Extract settings */
@@ -88,8 +88,8 @@ int main(int argc, char **argv)
 			} else if (strcmp(key, "lastStartTag") == 0) {
 				ctx.last_start_tag = (const char *)
 						json_object_get_string(val);
-			} else if (strcmp(key, "contentModelFlags") == 0) {
-				ctx.content_model =
+			} else if (strcmp(key, "initialStates") == 0) {
+				ctx.initial_state =
 						json_object_get_array(val);
 			} else if (strcmp(key, "processCDATA") == 0) {
 				ctx.process_cdata =
@@ -116,10 +116,10 @@ void run_test(context *ctx)
 	int i, max_i;
 	struct array_list *outputsave = ctx->output;
 
-	if (ctx->content_model == NULL) {
+	if (ctx->initial_state == NULL) {
 		max_i = 1;
 	} else {
-		max_i = array_list_length(ctx->content_model);
+		max_i = array_list_length(ctx->initial_state);
 	}
 
 	/* We test for each of the content models specified */
@@ -163,30 +163,35 @@ void run_test(context *ctx)
 				HUBBUB_TOKENISER_TOKEN_HANDLER,
 				&params) == HUBBUB_OK);
 
-		if (ctx->content_model == NULL) {
-			params.content_model.model =
-					HUBBUB_CONTENT_MODEL_PCDATA;
+		if (ctx->initial_state == NULL) {
+			params.initial_state.state =
+					HUBBUB_INITIAL_STATE_DATA;
 		} else {
 			const char *cm = json_object_get_string(
 				(struct json_object *)
-				array_list_get_idx(ctx->content_model, i));
+				array_list_get_idx(ctx->initial_state, i));
 
 			if (strcmp(cm, "PCDATA") == 0) {
-				params.content_model.model =
-						HUBBUB_CONTENT_MODEL_PCDATA;
-			} else if (strcmp(cm, "RCDATA") == 0) {
-				params.content_model.model =
-						HUBBUB_CONTENT_MODEL_RCDATA;
-			} else if (strcmp(cm, "CDATA") == 0) {
-				params.content_model.model =
-						HUBBUB_CONTENT_MODEL_CDATA;
+				params.initial_state.state =
+						HUBBUB_INITIAL_STATE_DATA;
+			} else if (strcmp(cm, "RCDATA state") == 0) {
+				
+				params.initial_state.state =
+						HUBBUB_INITIAL_STATE_RCDATA;
+			} else if (strcmp(cm, "CDATA state") == 0) {
+				params.initial_state.state =
+						HUBBUB_INITIAL_STATE_CDATA;
+			} else if (strcmp(cm, "RAWTEXT state") == 0) {
+			params.initial_state.state =
+						HUBBUB_INITIAL_STATE_RAWTEXT;
 			} else {
-				params.content_model.model =
-					HUBBUB_CONTENT_MODEL_PLAINTEXT;
+			params.initial_state.state =
+					HUBBUB_INITIAL_STATE_PLAINTEXT;
 			}
 		}
+
 		assert(hubbub_tokeniser_setopt(tok,
-				HUBBUB_TOKENISER_CONTENT_MODEL,
+				HUBBUB_TOKENISER_INITIAL_STATE,
 				&params) == HUBBUB_OK);
 
 		assert(parserutils_inputstream_append(stream,
-- 
1.8.3.2

