tokenizer.cpp

/************************************************************************
The zlib/libpng License

Copyright (c) 2006 Joerg Wiedenmann

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from
the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented;
you must not claim that you wrote the original software.
If you use this software in a product, an acknowledgment
in the product documentation would be appreciated but is
not required.

2. Altered source versions must be plainly marked as such,
and must not be misrepresented as being the original software.

3. This notice may not be removed or altered from any source distribution.

***********************************************************************/

/********************************************************************
	created:	2006-01-28
	filename: 	tokenizer.cpp
	author:		Jörg Wiedenmann
	
	purpose:	A tokenizer function which provides a very
				customizable way of breaking up strings.

	history:	2006-01-28, Original version
				2006-03-04, Fixed a small parsing bug, thanks Elias.
*********************************************************************/

#include "tokenizer.h"

using namespace std;

void tokenize ( const string& str, vector<string>& result,
			   const string& delimiters, const string& delimiters_preserve,
			   const string& quote, const string& esc )
{
	// clear the vector
	if ( false == result.empty() )
	{
		result.clear();
	}

	string::size_type pos = 0; // the current position (char) in the string
	char ch = 0; // buffer for the current character
	char delimiter = 0;	// the buffer for the delimiter char which
							// will be added to the tokens if the delimiter
							// is preserved
	char current_quote = 0; // the char of the current open quote
	bool quoted = false; // indicator if there is an open quote
	string token;  // string buffer for the token
	bool token_complete = false; // indicates if the current token is
								 // read to be added to the result vector
	string::size_type len = str.length();  // length of the input-string

	// for every char in the input-string
	while ( len > pos )
	{
		// get the character of the string and reset the delimiter buffer
		ch = str.at(pos);
		delimiter = 0;

		// assume ch isn't a delimiter
		bool add_char = true;

		// check ...

		// ... if the delimiter is an escaped character
		bool escaped = false; // indicates if the next char is protected
		if ( false == esc.empty() ) // check if esc-chars are  provided
		{
			if ( string::npos != esc.find_first_of(ch) )
			{
				// get the escaped char
				++pos;
				if ( pos < len ) // if there are more chars left
				{
					// get the next one
					ch = str.at(pos);

					// add the escaped character to the token
					add_char = true;
				}
				else // cannot get any more characters
				{
					// don't add the esc-char
					add_char = false;
				}

				// ignore the remaining delimiter checks
				escaped = true;
			}
		}

		// ... if the delimiter is a quote
		if ( false == quote.empty() && false == escaped )
		{
			// if quote chars are provided and the char isn't protected
			if ( string::npos != quote.find_first_of(ch) )
			{
				// if not quoted, set state to open quote and set
				// the quote character
				if ( false == quoted )
				{
					quoted = true;
					current_quote = ch;

					// don't add the quote-char to the token
					add_char = false;
				}
				else // if quote is open already
				{
					// check if it is the matching character to close it
					if ( current_quote == ch )
					{
						// close quote and reset the quote character
						quoted = false;
						current_quote = 0;

						// don't add the quote-char to the token
						add_char = false;
					}
				} // else
			}
		}

		// ... if the delimiter isn't preserved
		if ( false == delimiters.empty() && false == escaped &&
			 false == quoted )
		{
			// if a delimiter is provided and the char isn't protected by
			// quote or escape char
			if ( string::npos != delimiters.find_first_of(ch) )
			{
				// if ch is a delimiter and the token string isn't empty
				// the token is complete
				if ( false == token.empty() ) // BUGFIX: 2006-03-04
				{
					token_complete = true;
				}

				// don't add the delimiter to the token
				add_char = false;
			}
		}

		// ... if the delimiter is preserved - add it as a token
		bool add_delimiter = false;
		if ( false == delimiters_preserve.empty() && false == escaped &&
			 false == quoted )
		{
			// if a delimiter which will be preserved is provided and the
			// char isn't protected by quote or escape char
			if ( string::npos != delimiters_preserve.find_first_of(ch) )
			{
				// if ch is a delimiter and the token string isn't empty
				// the token is complete
				if ( false == token.empty() ) // BUGFIX: 2006-03-04
				{
					token_complete = true;
				}

				// don't add the delimiter to the token
				add_char = false;

				// add the delimiter
				delimiter = ch;
				add_delimiter = true;
			}
		}


		// add the character to the token
		if ( true == add_char )
		{
			// add the current char
			token.push_back( ch );
		}

		// add the token if it is complete
		if ( true == token_complete && false == token.empty() )
		{
			// add the token string
			result.push_back( token );

			// clear the contents
			token.clear();

			// build the next token
			token_complete = false;
		}

		// add the delimiter
		if ( true == add_delimiter )
		{
			// the next token is the delimiter
			string delim_token;
			delim_token.push_back( delimiter );
			result.push_back( delim_token );

			// REMOVED: 2006-03-04, Bugfix
		}

		// repeat for the next character
		++pos;
	} // while

	// add the final token
	if ( false == token.empty() )
	{
		result.push_back( token );
	}
}