tokenizer.cpp 5.9 KB
Newer Older
李晓兵's avatar
李晓兵 committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
/************************************************************************
The zlib/libpng License

Copyright (c) 2006 Joerg Wiedenmann

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from
the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented;
you must not claim that you wrote the original software.
If you use this software in a product, an acknowledgment
in the product documentation would be appreciated but is
not required.

2. Altered source versions must be plainly marked as such,
and must not be misrepresented as being the original software.

3. This notice may not be removed or altered from any source distribution.

***********************************************************************/

/********************************************************************
	created:	2006-01-28
	filename: 	tokenizer.cpp
	author:		Jörg Wiedenmann
	
	purpose:	A tokenizer function which provides a very
				customizable way of breaking up strings.

	history:	2006-01-28, Original version
				2006-03-04, Fixed a small parsing bug, thanks Elias.
*********************************************************************/

#include "tokenizer.h"

using namespace std;

// NOTE: altered from the original 2006 version - the body was restructured
// for readability, the behaviour is unchanged.

// Returns true when `c` is one of the characters of `set`.
// An empty `set` never matches.
static bool is_one_of ( const string& set, char c )
{
	return string::npos != set.find( c );
}

/**
 * Breaks `str` up into tokens and stores them in `result`.
 *
 * Any previous content of `result` is discarded first. Empty tokens are
 * never stored, so consecutive delimiters and an empty quoted section
 * produce nothing.
 *
 * @param str                  the input string to split
 * @param result               receives the tokens in order of appearance
 * @param delimiters           characters which end the current token and
 *                             are dropped
 * @param delimiters_preserve  characters which end the current token and
 *                             are stored as one-character tokens themselves
 * @param quote                characters which open a quoted section; the
 *                             section is closed by the same character that
 *                             opened it. The opening and closing characters
 *                             are dropped. Inside the section delimiters
 *                             and other quote characters are taken
 *                             literally. An unclosed section runs to the
 *                             end of the input.
 * @param esc                  characters which make the following character
 *                             literal (also inside a quoted section). The
 *                             escape character itself is dropped; an escape
 *                             character at the very end of the input is
 *                             dropped as well.
 *
 * Passing an empty string for any of the four character sets switches the
 * corresponding feature off.
 */
void tokenize ( const string& str, vector<string>& result,
			   const string& delimiters, const string& delimiters_preserve,
			   const string& quote, const string& esc )
{
	result.clear();

	const string::size_type length = str.length();
	string current;        // the token which is assembled at the moment
	bool in_quote = false; // true while a quoted section is open
	char open_quote = 0;   // the character which opened that section

	for ( string::size_type i = 0; i < length; ++i )
	{
		char c = str[i];
		bool keep = true; // will c become part of the current token?

		if ( is_one_of( esc, c ) )
		{
			// escape: step over the escape character and take the next
			// character literally - none of the checks below apply to it
			++i;
			if ( i < length )
			{
				c = str[i];
			}
			else
			{
				// the input ended right after the escape character
				keep = false;
			}
		}
		else
		{
			if ( is_one_of( quote, c ) )
			{
				if ( !in_quote )
				{
					// open a quoted section, drop the quote character
					in_quote = true;
					open_quote = c;
					keep = false;
				}
				else if ( c == open_quote )
				{
					// matching character: close the section and drop it
					in_quote = false;
					open_quote = 0;
					keep = false;
				}
				// any other quote character inside an open section is
				// an ordinary character
			}

			// The delimiter checks use the quote state as updated above:
			// an opening quote is never a delimiter, a closing one may be.
			if ( !in_quote )
			{
				const bool is_dropped = is_one_of( delimiters, c );
				const bool is_preserved = is_one_of( delimiters_preserve, c );

				if ( is_dropped || is_preserved )
				{
					keep = false;

					// the token in front of the delimiter is finished,
					// but empty tokens are not stored
					if ( !current.empty() )
					{
						result.push_back( current );
						current.clear();
					}

					// a preserved delimiter follows as its own token
					if ( is_preserved )
					{
						result.push_back( string( 1, c ) );
					}
				}
			}
		}

		if ( keep )
		{
			current.push_back( c );
		}
	}

	// whatever is left over forms the last token
	if ( !current.empty() )
	{
		result.push_back( current );
	}
}