2 * exml.c -- A simple SAX style XML parser
5 /********************************* Description ********************************/
7 * This is a recursive descent parser for XML text files. It is a simple,
8 * one-pass parser that invokes a user-supplied callback for key tokens in the
9 * XML file. The user supplies a read function so that XML files can be parsed
10 * from disk or from memory.
12 /********************************** Includes **********************************/
16 /****************************** Forward Declarations **************************/
/*
 * NOTE(review): the "|| 1" makes this condition always true, so everything
 * guarded by it is compiled whether or not BLD_FEATURE_EXML is set.
 */
18 #if BLD_FEATURE_EXML || 1
/* Prototypes for the file-local (static) helpers defined later in this file. */
20 static int parseNext(Exml *xp, int state);
21 static ExmlToken getToken(Exml *xp, int state);
22 static int getNextChar(Exml *xp);
23 static int scanFor(Exml *xp, char *str);
24 static int putLastChar(Exml *xp, int c);
25 static void error(Exml *xp, char *fmt, ...);
26 static void trimToken(Exml *xp);
28 /************************************ Code ************************************/
/*
 * Create a parser object. The visible statements allocate an Exml from ctx
 * via mprAllocTypeZeroed and then create two buffers with the new Exml as
 * their first argument: inBuf (EXML_BUFSIZE passed for both size arguments)
 * and tokBuf (the caller's initialSize and maxSize).
 * NOTE(review): no check of the allocation results is visible in this
 * excerpt -- confirm against the full source.
 */
30 Exml *exmlOpen(MprCtx ctx, int initialSize, int maxSize)
34 xp = mprAllocTypeZeroed(ctx, Exml);
36 xp->inBuf = mprCreateBuf(xp, EXML_BUFSIZE, EXML_BUFSIZE);
37 xp->tokBuf = mprCreateBuf(xp, initialSize, maxSize);
42 /******************************************************************************/
44 void exmlClose(Exml *xp)
51 /******************************************************************************/
53 void exmlSetParserHandler(Exml *xp, ExmlHandler h)
60 /******************************************************************************/
62 void exmlSetInputStream(Exml *xp, ExmlInputStream s, void *arg)
70 /******************************************************************************/
/* Store the caller-supplied parseArg pointer in the parser (xp->parseArg). */
75 void exmlSetParseArg(Exml *xp, void *parseArg)
79 xp->parseArg = parseArg;
82 /******************************************************************************/
87 void *exmlGetParseArg(Exml *xp)
94 /******************************************************************************/
96 * Parse an XML file. Return 0 for success, -1 for error.
/*
 * Public entry point: starts the parse by calling parseNext in the
 * EXML_BEGIN state and returns its result.
 * NOTE(review): parseNext is documented as returning -1, 0 or 1, whereas
 * this function is documented as returning 0 or -1 -- confirm whether a
 * value of 1 can reach the caller.
 */
99 int exmlParse(Exml *xp)
103 return parseNext(xp, EXML_BEGIN);
106 /******************************************************************************/
108 * XML parser. This is a recursive descent parser. Return -1 for errors, 0 for
109 * EOF and 1 if there is still more data to parse.
/*
 * Parser state machine. Each pass fetches a token with getToken() for the
 * current state and then dispatches on that state. When a "<" is seen the
 * function recurses with EXML_AFTER_LS to handle the nested construct.
 * The user callback (xp->handler) is invoked as
 *     handler(xp, state, elementName, attributeName, value)
 * with 0 passed for the arguments that do not apply.
 */
112 static int parseNext(Exml *xp, int state)
120 mprAssert(state >= 0);
123 handler = xp->handler;
128 * In this parse loop, the state is never assigned EOF or ERR. In
129 * such cases we always return EOF or ERR.
133 token = getToken(xp, state);
/* The lexer signals that the token did not fit in the token buffer. */
135 if (token == TOKEN_TOO_BIG) {
136 error(xp, "XML token is too big");
141 case EXML_BEGIN: /* ------------------------------------------ */
143 * Expect to get an element, comment or processing instruction
151 * Recurse to handle the new element, comment etc.
153 rc = parseNext(xp, EXML_AFTER_LS);
160 error(xp, "Syntax error");
165 case EXML_AFTER_LS: /* ------------------------------------------ */
/* Comment text is handed to the callback with the literal name "!--". */
168 state = EXML_COMMENT;
169 rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf));
/*
 * NOTE(review): a second callback also passes "!--"; the state assigned
 * before it is not visible in this excerpt -- confirm which token it serves.
 */
178 rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf));
185 case TOKEN_INSTRUCTIONS:
186 /* Just ignore processing instructions */
/*
 * Element name: duplicate it out of tokBuf (presumably because tokBuf is
 * reused for subsequent tokens -- confirm) and announce the new element.
 */
191 state = EXML_NEW_ELT;
192 tname = mprStrdup(xp, mprGetBufStart(tokBuf));
197 rc = (*handler)(xp, state, tname, 0, 0);
204 error(xp, "Syntax error");
209 case EXML_NEW_ELT: /* ------------------------------------------ */
211 * We have seen the opening "<element" for a new element and have
212 * not yet seen the terminating ">" of the opening element.
217 * Must be an attribute name
/* Attribute: name, then "=", then a text token holding the value. */
219 aname = mprStrdup(xp, mprGetBufStart(tokBuf));
220 token = getToken(xp, state);
221 if (token != TOKEN_EQ) {
222 error(xp, "Missing assignment for attribute \"%s\"", aname);
226 token = getToken(xp, state);
227 if (token != TOKEN_TEXT) {
228 error(xp, "Missing value for attribute \"%s\"", aname);
/* Report the attribute, then return to scanning the opening element. */
231 state = EXML_NEW_ATT;
232 rc = (*handler)(xp, state, tname, aname,
233 mprGetBufStart(tokBuf));
237 state = EXML_NEW_ELT;
242 * This is ">" the termination of the opening element
244 if (*tname == '\0') {
245 error(xp, "Missing element name");
250 * Tell the user that the opening element is now complete
252 state = EXML_ELT_DEFINED;
253 rc = (*handler)(xp, state, tname, 0, 0);
257 state = EXML_ELT_DATA;
262 * If we see a "/>" then this is a solo element
264 if (*tname == '\0') {
265 error(xp, "Missing element name");
268 state = EXML_SOLO_ELT_DEFINED;
269 rc = (*handler)(xp, state, tname, 0, 0);
277 error(xp, "Syntax error");
282 case EXML_ELT_DATA: /* -------------------------------------- */
284 * We have seen the full opening element "<name ...>" and now
285 * await data or another element.
287 if (token == TOKEN_LS) {
289 * Recurse to handle the new element, comment etc.
291 rc = parseNext(xp, EXML_AFTER_LS);
297 } else if (token == TOKEN_LS_SLASH) {
298 state = EXML_END_ELT;
301 } else if (token != TOKEN_TEXT) {
/* Only non-empty data is forwarded to the callback. */
304 if (mprGetBufLength(tokBuf) > 0) {
306 * Pass the data between the element to the user
308 rc = (*handler)(xp, state, tname, 0, mprGetBufStart(tokBuf));
315 case EXML_END_ELT: /* -------------------------------------- */
316 if (token != TOKEN_TEXT) {
317 error(xp, "Missing closing element name for \"%s\"", tname);
321 * The closing element name must match the opening element name
323 if (strcmp(tname, mprGetBufStart(tokBuf)) != 0) {
/*
 * NOTE(review): the two adjacent string literals below concatenate without
 * a separating space, so the message reads "...on line %dopening name...".
 * The line number also lands in the middle of the sentence. The message
 * text should be fixed (for example by adding a space before "opening").
 */
325 "Closing element name \"%s\" does not match on line %d"
326 "opening name \"%s\"",
327 mprGetBufStart(tokBuf), xp->lineNumber, tname);
330 rc = (*handler)(xp, state, tname, 0, 0);
/* After the closing name, the next token must be TOKEN_GR (presumably ">"). */
334 if (getToken(xp, state) != TOKEN_GR) {
335 error(xp, "Syntax error");
340 case EXML_EOF: /* ---------------------------------------------- */
343 case EXML_ERR: /* ---------------------------------------------- */
360 /******************************************************************************/
362 * Lexical analyser for XML. Return the next token, reading input as required.
363 * It uses a one-token look-ahead and push-back mechanism (LAR1 parser).
364 * Text token identifiers are left in the tokBuf parser buffer on exit.
365 * This lexer has special cases for the state EXML_ELT_DATA, where we
366 * have an optimized read of element data, and EXML_AFTER_LS, where we
367 * distinguish between element names, processing instructions and comments.
/*
 * Return the next token for the given parser state. Characters are pulled
 * one at a time with getNextChar(); text is accumulated in tokBuf with
 * mprPutCharToBuf(), and a failure of that call is reported to the caller
 * as TOKEN_TOO_BIG.
 */
370 static ExmlToken getToken(Exml *xp, int state)
372 MprBuf *tokBuf, *inBuf;
379 mprAssert(state >= 0);
381 if ((c = getNextChar(xp)) < 0) {
387 * Special case parsing for names and for element data. We do this for
388 * performance so we can return to the caller the largest token possible
/* Fast path: gather element data in one go while in EXML_ELT_DATA. */
390 if (state == EXML_ELT_DATA) {
392 * Read all the data up to the start of the closing element "<" or the
393 * start of a sub-element.
397 if ((c = getNextChar(xp)) < 0) {
403 if ((c = getNextChar(xp)) < 0) {
407 return TOKEN_LS_SLASH;
413 if (mprPutCharToBuf(tokBuf, c) < 0) {
414 return TOKEN_TOO_BIG;
416 if ((c = getNextChar(xp)) < 0) {
422 * Put back the last look-ahead character
427 * If all white space, then zero the token buffer
/*
 * Walks tokBuf up to a zero byte, so tokBuf must already be NUL-terminated
 * at this point -- the terminating call is not visible here; confirm.
 */
429 for (cp = tokBuf->start; *cp; cp++) {
447 if ((c = getNextChar(xp)) < 0) {
451 return TOKEN_LS_SLASH;
463 if ((c = getNextChar(xp)) < 0) {
467 return TOKEN_SLASH_GR;
478 * We handle element names, attribute names and attribute values
479 * here. We do NOT handle data between elements here. Read the
480 * token. Stop on white space or a closing element ">"
483 if ((c = getNextChar(xp)) < 0) {
/* Quoted text: accumulate until the character equal to xp->quoteChar. */
486 while (c != xp->quoteChar) {
487 if (mprPutCharToBuf(tokBuf, c) < 0) {
488 return TOKEN_TOO_BIG;
490 if ((c = getNextChar(xp)) < 0) {
/* Unquoted text: accumulate until whitespace, '>', '/' or '='. */
497 while (!isspace(c) && c != '>' && c != '/' && c != '=') {
498 if (mprPutCharToBuf(tokBuf, c) < 0) {
499 return TOKEN_TOO_BIG;
501 if ((c = getNextChar(xp)) < 0) {
507 if (mprGetBufLength(tokBuf) <= 0) {
510 mprAddNullToBuf(tokBuf);
512 if (state == EXML_AFTER_LS) {
514 * If we are just inside an element "<", then analyze what we
515 * have to see if we have an element name, instruction or
516 * comment. Tokbuf will hold "?" for instructions or "!--"
/* Leading '?': processing instruction; consume input through "?>". */
519 if (mprLookAtNextCharInBuf(tokBuf) == '?') {
520 /* Just ignore processing instructions */
521 rc = scanFor(xp, "?>");
/* The condition guarding this return is not visible; presumably rc < 0. */
523 return TOKEN_TOO_BIG;
524 } else if (rc == 0) {
527 return TOKEN_INSTRUCTIONS;
/* Leading '!': either a CDATA section or a comment. */
529 } else if (mprLookAtNextCharInBuf(tokBuf) == '!') {
531 * First discard the comment leadin "!--" and eat leading
/* CDATA section: token is exactly "![CDATA["; scan through "]]>". */
534 if (strcmp((char*) tokBuf->start, "![CDATA[") == 0) {
537 c = mprLookAtNextCharInBuf(inBuf);
539 if ((c = getNextChar(xp)) < 0) {
542 c = mprLookAtNextCharInBuf(inBuf);
545 rc = scanFor(xp, "]]>");
547 return TOKEN_TOO_BIG;
548 } else if (rc == 0) {
556 c = mprLookAtNextCharInBuf(inBuf);
558 if ((c = getNextChar(xp)) < 0) {
561 c = mprLookAtNextCharInBuf(inBuf);
/* Comment: scan through the terminating "-->". */
564 rc = scanFor(xp, "-->");
566 return TOKEN_TOO_BIG;
567 } else if (rc == 0) {
570 return TOKEN_COMMENT;
577 if ((c = getNextChar(xp)) < 0) {
582 /* Should never get here */
587 /******************************************************************************/
589 * Scan for a pattern. Eat and discard input up to the pattern. Return 1 if
590 * the pattern was found, return 0 if not found. Return < 0 on errors.
/*
 * Scan the input for the pattern str. The visible code reads input
 * characters while stepping through the pattern and appends each character
 * read to tokBuf; once the pattern has been matched, its strlen(str) bytes
 * are trimmed from the end of tokBuf.
 * NOTE(review): the header comment says input is "discarded", yet the
 * characters read are stored in tokBuf -- presumably so the skipped text
 * (e.g. a comment body) is available to the caller; confirm.
 */
593 static int scanFor(Exml *xp, char *str)
604 for (cp = str; *cp; cp++) {
605 if ((c = getNextChar(xp)) < 0) {
609 if (mprPutCharToBuf(tokBuf, c) < 0) {
619 * Remove the pattern from the tokBuf
622 mprAdjustBufEnd(tokBuf, -(int) strlen(str));
630 /******************************************************************************/
632 * Get another character. We read and buffer blocks of data if we need more
/*
 * Return the next input character. When inBuf holds no data, the user's
 * read function (xp->readFn) is called with xp->inputArg to refill it: the
 * read targets the buffer start, is limited to the buffer's linear space,
 * and the buffer end is advanced by the count l that readFn returned.
 * The character is then taken from inBuf.
 * NOTE(review): handling of a zero or negative l is not visible in this
 * excerpt -- confirm.
 */
636 static int getNextChar(Exml *xp)
643 if (mprGetBufLength(inBuf) <= 0) {
645 * Flush to reset the servp/endp pointers to the start of the buffer
646 * so we can do a maximal read
649 l = (xp->readFn)(xp, xp->inputArg, mprGetBufStart(inBuf),
650 mprGetBufLinearSpace(inBuf));
654 mprAdjustBufEnd(inBuf, l);
656 c = mprGetCharFromBuf(inBuf);
664 /******************************************************************************/
666 * Put back a character in the input buffer
/*
 * Push the character c back into the input buffer (xp->inBuf) using
 * mprInsertCharToBuf. A negative result from that call takes the failure
 * branch, whose body is not visible in this excerpt.
 */
669 static int putLastChar(Exml *xp, int c)
671 if (mprInsertCharToBuf(xp->inBuf, (char) c) < 0) {
681 /******************************************************************************/
683 * Output a parse message
/*
 * Record a parse error. The printf-style fmt and its arguments are
 * formatted into a temporary string buf, which is then wrapped together
 * with the current line number into xp->errMsg as
 * "XML error: <message>\nAt line <n>\n". Both formatting calls are given
 * MPR_MAX_STRING as their size argument.
 * NOTE(review): release of buf and of any previous xp->errMsg is not
 * visible in this excerpt -- confirm.
 */
686 static void error(Exml *xp, char *fmt, ...)
694 mprAllocVsprintf(MPR_LOC_ARGS(xp), &buf, MPR_MAX_STRING, fmt, args);
698 * MOB need to add the failing line text and a pointer to which column
701 mprAllocSprintf(MPR_LOC_ARGS(xp), &xp->errMsg, MPR_MAX_STRING,
702 "XML error: %s\nAt line %d\n", buf, xp->lineNumber);
707 /******************************************************************************/
709 * Remove trailing whitespace in a token and ensure it is terminated with
710 * a NUL character for easy parsing
/*
 * Strip trailing whitespace from the token buffer by moving its end back
 * one byte at a time, then terminate the buffer with mprAddNullToBuf.
 * NOTE(review): the result of mprLookAtLastCharInBuf is passed straight to
 * isspace(); if that helper can return a negative value (for example on an
 * empty buffer, or a plain char above 127) the call would be undefined --
 * confirm its return type and empty-buffer behaviour.
 */
713 static void trimToken(Exml *xp)
715 while (isspace(mprLookAtLastCharInBuf(xp->tokBuf))) {
716 mprAdjustBufEnd(xp->tokBuf, -1);
718 mprAddNullToBuf(xp->tokBuf);
721 /******************************************************************************/
/*
 * Return the parser's error message. The visible code special-cases an
 * unset message (xp->errMsg == 0); what is returned in that case is not
 * visible in this excerpt.
 */
723 const char *exmlGetErrorMsg(Exml *xp)
725 if (xp->errMsg == 0) {
731 /******************************************************************************/
/* Return the parser's current line number (xp->lineNumber). */
733 int exmlGetLineNumber(Exml *xp)
735 return xp->lineNumber;
738 /******************************************************************************/
/*
 * Intentionally empty function.
 * NOTE(review): presumably this sits in an #else branch (not visible in
 * this excerpt) so the translation unit is not empty when the feature is
 * compiled out -- confirm. The empty parameter list "()" could be written
 * "(void)" to give the function a proper prototype.
 */
741 void exmlParserDummy() {}
742 #endif /* BLD_FEATURE_EXML */
750 * vim600: sw=4 ts=4 fdm=marker