00001 // See ../../license.txt for license information. 00002 // 00003 // parse.cpp 00004 // 00005 // NOTES 00006 // XML Parser for the persistence framework. 00007 // 00008 // 30-Jun-2003 phamilton Created 00009 // 00010 00011 #define PERSIST_IN_LIBRARY_SOURCE 00012 00013 #include "parse.hpp" 00014 #include <iostream> 00015 #include <fstream> 00016 #include "expat-1.95.5/lib/expat.h" 00017 #include "boost/lexical_cast.hpp" 00018 #include "boost/format.hpp" 00019 00020 using namespace ph::persist::xml; 00021 00022 // format strings. 00023 const char XMLFmt_error[] = "file: %s %s at line %d"; 00024 const char XMLFmt_expected[] = "expected %s=\"string\""; 00025 00026 // common possible error messages. 00027 const char XMLErr_mismatched_end_tag[] = "mismatched end element tag."; 00028 00029 #define PROGRESS_UNIT 512 00030 #define BUFFER_SIZE 1024 00031 00032 bool parse::parse_xml(std::istream *stream, const std::string &streampath, parse *parser, parse_progress *progress) 00033 { 00034 00035 if (progress) 00036 { 00037 stream->seekg(0, std::ios_base::end); 00038 long len = stream->tellg(); 00039 long count = len / PROGRESS_UNIT; 00040 progress->total(count > 0 ? count : 1); 00041 } 00042 00043 if (progress) 00044 progress->progress(0); 00045 00046 parser->startparse(streampath); 00047 int done = 0; 00048 long total = 0; 00049 bool parseresult = true; 00050 while (!done) 00051 { 00052 char buf[BUFFER_SIZE]; 00053 stream->read(buf, sizeof(buf)); 00054 long len = stream->gcount(); 00055 done = len < (long)sizeof(buf); 00056 int error = parser->doparse(buf, len, done); 00057 if (error != PARSE_SUCCESS) 00058 { 00059 parseresult = false; 00060 done = 1; 00061 } 00062 total += len; 00063 if (progress) 00064 { 00065 long p = total / PROGRESS_UNIT; 00066 progress->progress(p); 00067 } 00068 } 00069 parser->endparse(); 00070 00071 return parseresult; 00072 } 00073 00074 void parse::startparse(const std::string &streamname) 00075 /* 00076 Called at the start of a parse. Set's to the expat 00077 data structures. 00078 */ 00079 { 00080 assert(_parser == NULL); 00081 _parser = XML_ParserCreate(NULL); 00082 _filename = streamname; // for error messages. 00083 _error = PARSE_SUCCESS; 00084 00085 XML_SetUserData(_parser, this); 00086 XML_SetElementHandler(_parser, sstartelement_handler, sendelement_handler); 00087 XML_SetCharacterDataHandler(_parser, scdata_handler); 00088 XML_SetCommentHandler(_parser, scomment_handler); 00089 XML_SetDefaultHandler(_parser, sdefault_handler); 00090 } 00091 00092 int parse::doparse(char *buf, long len, int done) 00093 /* 00094 Called in the middle of a parse. Feed more XML into the 00095 parser. 00096 */ 00097 { 00098 try 00099 { 00100 if (XML_Parse(_parser, buf, len, done)) 00101 return PARSE_SUCCESS; 00102 } 00103 catch (...) 00104 { 00105 // the only thrown exception is to end the parse. 00106 _error = PARSE_BADXMLTYPE; 00107 return _error; 00108 } 00109 00110 // for some reason, the string returned is not actually UNICODE! So we convert it to unicode here. 00111 /* TBD 00112 CWStr m; 00113 m.Convert(string); 00114 00115 XMLError(m.c_str()); 00116 */ 00117 return _error; 00118 } 00119 00120 void parse::endparse() 00121 /* 00122 Called at the end of a parse, or to abort a parse. 00123 */ 00124 { 00125 finish_handler(); 00126 XML_ParserFree(_parser); 00127 _parser = NULL; 00128 } 00129 00130 int parse::parsestream(std::istream *stream, const std::string &streamname) 00131 /* 00132 Wrapper function to parse a file of XML. 00133 */ 00134 { 00135 startparse(streamname); 00136 00137 int done = 0; 00138 while (!done) 00139 { 00140 char buf[BUFFER_SIZE]; 00141 stream->read(buf, sizeof(buf)); 00142 long len = stream->gcount(); 00143 done = len < (long)sizeof(buf); 00144 int error = doparse(buf, len, done); 00145 if (error != PARSE_SUCCESS) 00146 return _error; 00147 } 00148 endparse(); 00149 00150 return _error; 00151 } 00152 00153 int parse::doparsefile(const std::string &filename) 00154 /* 00155 Parse a file given a filename of a file on disk. 00156 */ 00157 { 00158 int result = PARSE_NOFILE; 00159 std::ifstream f(filename.c_str()); 00160 if (f.is_open()) 00161 { 00162 result = parsestream(&f, filename); 00163 f.close(); 00164 } 00165 return result; 00166 } 00167 00168 void parse::sstartelement_handler(void *userData, const XML_Char *name, const XML_Char **atts) 00169 { 00170 parse *me = reinterpret_cast<parse *>(userData); 00171 00172 // push this element. 00173 me->_elementstack.push_back(name); 00174 00175 xmlstring n(name); 00176 std::vector<xmlstring> a; 00177 if (atts) 00178 for (int i=0; atts[i]; i++) 00179 a.push_back(atts[i]); 00180 me->startelement_handler(n, a); 00181 } 00182 00183 void parse::sendelement_handler(void *userData, const XML_Char *name) 00184 { 00185 parse *me = reinterpret_cast<parse *>(userData); 00186 00187 me->endelement_handler(name); 00188 00189 // pop the element. 00190 if (me->_elementstack.back() == name) 00191 me->_elementstack.pop_back(); 00192 else 00193 me->error(XMLErr_mismatched_end_tag); 00194 } 00195 00196 void parse::scdata_handler(void *userData, const XML_Char *s, int len) 00197 { 00198 parse *me = reinterpret_cast<parse *>(userData); 00199 00200 // use the length in conversion. 00201 xmlstring ws(s, len); 00202 00203 me->cdata_handler(ws, len); 00204 } 00205 00206 void parse::scomment_handler(void *userData, const XML_Char *data) 00207 { 00208 parse *me = reinterpret_cast<parse *>(userData); 00209 00210 // use the length in conversion. 00211 xmlstring ws(data); 00212 00213 me->comment_handler(data); 00214 } 00215 00216 void parse::sdefault_handler(void *userData, const XML_Char *s, int len) 00217 { 00218 parse *me = reinterpret_cast<parse *>(userData); 00219 00220 // use the length in conversion. 00221 xmlstring ws(s, len); 00222 00223 me->default_handler(ws, len); 00224 } 00225 00226 xmlstring parse::attr(const std::vector<xmlstring> &attrs, int index) 00227 { 00228 if ((int)attrs.size() > (index * 2)) 00229 return attrs[index * 2]; 00230 return S(""); 00231 } 00232 00233 xmlstring parse::attrval(const std::vector<xmlstring> &attrs, int index) 00234 { 00235 if ((int)attrs.size() > ((index * 2) + 1)) 00236 return attrs[(index * 2) + 1]; 00237 return S(""); 00238 } 00239 00240 xmlstring parse::attr(const std::vector<xmlstring> &attrs, const xmlstring &token) 00241 { 00242 // these are processed in pairs. So a simple for() is best here. 00243 for (int i=0; i < (int)attrs.size(); i++) 00244 { 00245 if (token == attrs[i]) 00246 return attrs[i+1]; 00247 i++; 00248 } 00249 00250 return S(""); 00251 } 00252 00253 xmlstring parse::expectedattr(const std::vector<xmlstring> &attrs, const xmlstring &token) 00254 { 00255 xmlstring a = attr(attrs, token); 00256 if (!a.empty()) 00257 return a; 00258 00259 expected_error(token); 00260 return S(""); 00261 } 00262 00263 void parse::expected_error(const xmlstring &token) 00264 { 00265 error(boost::io::str(boost::format(XMLFmt_expected) % boost::lexical_cast<std::string>(token))); 00266 } 00267 00268 void parse::error(const std::string &s, bool detail) 00269 /* 00270 Default error does a message box. 00271 */ 00272 { 00273 if (!_silent) 00274 { 00275 if (_errorhandler) 00276 { 00277 if (detail) 00278 { 00279 // on debian, the direct version of this that uses a stream doesn't seem to work, so just cast to a string 00280 // for now. 00281 *_errorhandler << boost::io::str(boost::format(XMLFmt_error) % _filename % s % XML_GetCurrentLineNumber(_parser)) << std::endl; 00282 } 00283 else 00284 *_errorhandler << s << std::endl; 00285 } 00286 } 00287 00288 _error = PARSE_XMLERROR; 00289 } 00290 00291 void parse::error(const std::string &format, const std::string &s1, bool detail) 00292 { 00293 error(boost::io::str(boost::format(format) % s1), detail); 00294 } 00295 00296 void parse::error(const std::string &format, const std::string &s1, const std::string &s2, bool detail) 00297 { 00298 error(boost::io::str(boost::format(format) % s1 % s2), detail); 00299 } 00300 00301 // we escape all data with 2 sets of this char... 00302 const char kEscapeChar = '\\'; 00303 00304 // and here are the things that we escape. 00305 static struct { char c; const char *s; } gXMLEncodingTable[] = 00306 { 00307 { '<', S("lt") }, 00308 { '>', S("gt") }, 00309 { '&', S("amp") }, 00310 { 0, 0 } 00311 }; 00312 00313 // some helper strings. The short header is used as a way of telling whether a particular 00314 // string contains XML or not. 00315 const xmlstring kXMLShortHeader = S("<?xml version=\"1.0\""); 00316 const xmlstring kXMLLongHeader = S("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>"); 00317 00318 bool parse::encodexmldata(const xmlstring &s, xmlstring *news) 00319 { 00320 *news = S(""); 00321 for (xmlstring::const_iterator i = s.begin(); i != s.end(); i++) 00322 { 00323 if (*i == kEscapeChar) 00324 { 00325 *news += kEscapeChar; 00326 *news += kEscapeChar; 00327 00328 } 00329 else 00330 { 00331 int j=0; 00332 while (gXMLEncodingTable[j].c && gXMLEncodingTable[j].c != *i) 00333 j++; 00334 if (gXMLEncodingTable[j].c) 00335 { 00336 *news += kEscapeChar; 00337 *news += gXMLEncodingTable[j].s; 00338 *news += kEscapeChar; 00339 } 00340 else 00341 *news += *i; 00342 } 00343 } 00344 00345 return s.length() != news->length(); 00346 } 00347 00348 bool parse::decodexmldata(const xmlstring &s, xmlstring *news) 00349 { 00350 // if the string is actually a chunk of XML, then we don't decode (we are already 00351 // decoded). 00352 if (s.substr(0, kXMLShortHeader.length()) == kXMLShortHeader) 00353 return false; 00354 00355 bool escape = false; 00356 xmlstring escdata; 00357 *news = S(""); 00358 for (xmlstring::const_iterator i = s.begin(); i != s.end(); i++) 00359 { 00360 if (escape) 00361 { 00362 if (*i == kEscapeChar) 00363 { 00364 if (escdata == S("")) 00365 *news += kEscapeChar; 00366 else 00367 { 00368 // finished escaping. 00369 int j=0; 00370 while (gXMLEncodingTable[j].c && gXMLEncodingTable[j].s != escdata) 00371 j++; 00372 if (gXMLEncodingTable[j].c) 00373 *news += gXMLEncodingTable[j].c; 00374 else 00375 *news += escdata; 00376 } 00377 escape = false; 00378 } 00379 else 00380 escdata += *i; 00381 } 00382 else if (*i == kEscapeChar) 00383 { 00384 escdata = S(""); 00385 escape = true; 00386 } 00387 else 00388 *news += *i; 00389 } 00390 00391 if (escape) 00392 *news += escdata; 00393 00394 return s.length() != news->length(); 00395 } 00396