[BACK] Return to SGML.c CVS log [TXT] [DIR] Up to [Public] / libwww / Library / src

Annotation of libwww/Library/src/SGML.c, revision 1.1

1.1 ! timbl 1: /*           General SGML Parser code        SGML.c
 ! 2: **           ========================
 ! 3: **
 ! 4: **   This module implements an HTSGMLContext object. To parse an
 ! 5: **   SGML file, create this object which is a parser. The object
 ! 6: **   is (currently) created by being passed a DTD structure.
 ! 7: **
 ! 8: **   
 ! 9: */
 ! 10: #include "SGML.h"
 ! 11: 
 ! 12: #include <ctype.h>
 ! 13: #include <stdio.h>
 ! 14: #include "HTUtils.h"
 ! 15: #include "HTChunk.h"
 ! 16: #include "tcp.h"        /* For FROMASCII */
 ! 17: 
 ! 18: /*   The State (context) of the parser
 ! 19: **
 ! 20: **   This is passed with each call to make the parser recursive
 ! 21: **
 ! 22: */
 ! 23: 
 ! 24: struct _HTSGMLContext {
 ! 25:   SGML_dtd      *dtd;
 ! 26:   void        (*contents_treatment) PARAMS((void * data, char c));
 ! 27:   HTTag       *current_tag;
 ! 28:   attr        *current_attribute;
 ! 29:   HTChunk      *string;
 ! 30:   HTElement     *element_stack;
 ! 31:   enum sgml_state { S_text, S_litteral, S_tag, S_tag_gap, 
 ! 32:        S_attr, S_attr_gap, S_equals, S_value,
 ! 33:        S_ero, S_cro,
 ! 34:         S_squoted, S_dquoted, S_end, S_entity, S_junk_tag} state;
 ! 35:   void *       callerData;
 ! 36: };
 ! 37: 
 ! 38: 
 ! 39: /*   Handle Attribute
 ! 40: **   ----------------
 ! 41: */
 ! 42: /* PUBLIC CONST char * SGML_default = "";  ?? */
 ! 43: 
 ! 44: #ifdef __STDC__
 ! 45: PRIVATE void handle_attribute_name(HTSGMLContext context, const char * s)
 ! 46: #else
 ! 47: PRIVATE void handle_attribute_name(context, s)
 ! 48:   HTSGMLContext context;
 ! 49:   char *s;
 ! 50: #endif
 ! 51: {
 ! 52:   attr* a;
 ! 53:   for(  a = context->current_tag->attributes;
 ! 54:      a->name;
 ! 55:      a++) {
 ! 56:    if (0==strcasecomp(a->name, s))
 ! 57:      break;
 ! 58:   }
 ! 59:   if (!a->name) {
 ! 60:    if (TRACE)
 ! 61:      fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n",
 ! 62:        s, context->current_tag->name);
 ! 63:     context->current_attribute = 0;    /* Invalid */
 ! 64:    return;
 ! 65:   }
 ! 66:   a->present = YES;
 ! 67:   if (a->value) {
 ! 68:     free(a->value);
 ! 69:    a->value = 0;
 ! 70:   }
 ! 71:   context->current_attribute = a;
 ! 72: }
 ! 73: 
 ! 74: 
 ! 75: /*   Handle attribute value
 ! 76: **   ----------------------
 ! 77: */
 ! 78: #ifdef __STDC__
 ! 79: PRIVATE void handle_attribute_value(HTSGMLContext context, const char * s)
 ! 80: #else
 ! 81: PRIVATE void handle_attribute_value(context, s)
 ! 82:   HTSGMLContext context;
 ! 83:   char *s;
 ! 84: #endif
 ! 85: {
 ! 86:   if (context->current_attribute) {
 ! 87:    StrAllocCopy(context->current_attribute->value, s);
 ! 88:   } else {
 ! 89:     if (TRACE) fprintf(stderr, "SGML: Attribute value %s ignored\n", s);
 ! 90:   }
 ! 91:   context->current_attribute = 0;  /* can't have two assignments! */
 ! 92: }
 ! 93: 
 ! 94: /*   Handle entity
 ! 95: **   -------------
 ! 96: **
 ! 97: ** On entry,
 ! 98: **   s    contains the entity name zero terminated
 ! 99: ** Bugs:
 ! 100: **   If the entity name is unknown, the terminator is treated as
 ! 101: **   a printable non-special character in all cases, even if it is '<'
 ! 102: */
 ! 103: #ifdef __STDC__
 ! 104: PRIVATE void handle_entity(HTSGMLContext context, char term)
 ! 105: #else
 ! 106: PRIVATE void handle_entity(context, term)
 ! 107:   HTSGMLContext context;
 ! 108:   char term;
 ! 109: #endif
 ! 110: {
 ! 111:   entity * e;
 ! 112:   entity * entities = context->dtd->entities;
 ! 113:   CONST char *s = context->string->data;
 ! 114: 
 ! 115:   for(e = entities; e->name; e++) {
 ! 116:    if (0==strcmp(e->name, s)) {
 ! 117:      char * p;
 ! 118:      for (p=e->representation; *p; p++) {
 ! 119:        (*context->contents_treatment)(context->callerData, *p);
 ! 120:      }
 ! 121:      return;   /* Good */
 ! 122:    }
 ! 123:   }
 ! 124:   /* If entity string not found, display as text */
 ! 125:   if (TRACE)
 ! 126:    fprintf(stderr, "SGML: Unknown entity %s\n", s); 
 ! 127:   (*context->contents_treatment)(context->callerData, '&');
 ! 128:   {
 ! 129:    CONST char *p;
 ! 130:    for (p=s; *p; p++) {
 ! 131:      (*context->contents_treatment)(context->callerData, *p);
 ! 132:    }
 ! 133:   }
 ! 134:   (*context->contents_treatment)(context->callerData, term);
 ! 135: }
 ! 136: 
 ! 137: /*   End element
 ! 138: */
 ! 139: #ifdef __STDC__
 ! 140: PRIVATE void end_element(HTSGMLContext context, HTTag * old_tag)
 ! 141: #else
 ! 142: PRIVATE void end_element(context, old_tag)
 ! 143:   HTTag * old_tag;
 ! 144:   HTSGMLContext context;
 ! 145: #endif
 ! 146: {
 ! 147:   if (TRACE) fprintf(stderr, "SGML: End  </%s>\n", old_tag->name);
 ! 148:   if (!old_tag->end) {
 ! 149:     if (TRACE) fprintf(stderr,"SGML: Illegal end tag </%s> found.\n",
 ! 150:        old_tag->name);
 ! 151:    return;
 ! 152:   }
 ! 153:   while (context->element_stack)   {/* Loop is error path only */
 ! 154:    HTElement * N = context->element_stack;
 ! 155:    HTTag * t = N->tag;
 ! 156:    
 ! 157:    if (old_tag != t) {       /* Mismatch: syntax error */
 ! 158:      if (context->element_stack->next) { /* This is not the last level */
 ! 159:        if (TRACE) fprintf(stderr,
 ! 160:        "SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
 ! 161:          old_tag->name, t->name, t->name);
 ! 162:      } else {          /* last level */
 ! 163:        if (TRACE) fprintf(stderr,
 ! 164:          "SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
 ! 165:          old_tag->name, t->name, old_tag->name);
 ! 166:        return;         /* Ignore */
 ! 167:      }
 ! 168:    }
 ! 169:    
 ! 170:    context->element_stack = N->next;        /* Remove from stack */
 ! 171:    free(N);
 ! 172:    (t->end)(context->callerData,
 ! 173:         t,
 ! 174:         context->element_stack);    /* Assume tag end */
 ! 175:    if (context->element_stack)       /* not end of document */
 ! 176:      context->contents_treatment = context->element_stack->tag->treat;
 ! 177:    if (old_tag == t) return; /* Correct sequence */
 ! 178:    
 ! 179:    /* Syntax error path only */
 ! 180:    
 ! 181:   }
 ! 182:   fprintf(stderr,
 ! 183:    "SGML: Extra end tag </%s> found and ignored.\n", old_tag->name);
 ! 184: }
 ! 185: 
 ! 186: 
 ! 187: /*   Start a element
 ! 188: */
 ! 189: #ifdef __STDC__
 ! 190: PRIVATE void start_element(HTSGMLContext context)
 ! 191: #else
 ! 192: PRIVATE void start_element(context)
 ! 193:   HTSGMLContext context;
 ! 194: #endif
 ! 195: {
 ! 196:   HTTag * new_tag = context->current_tag;
 ! 197:   
 ! 198:   if (TRACE) fprintf(stderr, "SGML: Start <%s>\n", new_tag->name);
 ! 199:   (*new_tag->begin)(context->callerData, new_tag, context->element_stack);
 ! 200:   if (new_tag->end) {        /* i.e. tag not empty */
 ! 201:    HTElement * N = (HTElement *)malloc(sizeof(HTElement));
 ! 202:     if (N == NULL) outofmem(__FILE__, "start_element");
 ! 203:    N->next = context->element_stack;
 ! 204:    N->tag = new_tag;
 ! 205:    context->element_stack = N;
 ! 206:    context->contents_treatment = new_tag->treat;
 ! 207:   }
 ! 208: }
 ! 209: 
 ! 210: /*________________________________________________________________________
 ! 211: **           Public Methods
 ! 212: */
 ! 213: 
 ! 214: /*   Create SGML Engine
 ! 215: **   ------------------
 ! 216: **
 ! 217: ** On entry,
 ! 218: **   dtd->tags        represents the DTD, along with
 ! 219: **   dtd->entities
 ! 220: **
 ! 221: **   default_tag   represents the initial and final actions,
 ! 222: **           and the character processing, for data outside
 ! 223: **           any tags. May not be empty.
 ! 224: */
 ! 225: 
 ! 226: PUBLIC HTSGMLContext SGML_begin ARGS1(SGML_dtd *,dtd)
 ! 227: {
 ! 228:   HTSGMLContext context = (HTSGMLContext) malloc(sizeof(*context));
 ! 229:   if (!context) outofmem(__FILE__, "SGML_begin");
 ! 230: 
 ! 231:   context->string = HTChunkCreate(128);   /* Grow by this much */
 ! 232:   context->dtd = dtd;
 ! 233:   context->state = S_text;
 ! 234:   context->element_stack = 0;            /* empty */
 ! 235:   context->callerData = (void*) 0;      /* unspcified as yet */
 ! 236:   context->current_tag = dtd->default_tag;
 ! 237:   start_element(context);      /* Start document */
 ! 238:   return context;
 ! 239: }
 ! 240: 
 ! 241: 
 ! 242: PUBLIC void SGML_end ARGS1(HTSGMLContext, context)
 ! 243: {
 ! 244:   end_element(context, context->dtd->default_tag);  /* End document */
 ! 245:   HTChunkFree(context->string);
 ! 246:   free(context);
 ! 247: }
 ! 248: 
 ! 249: /*   Read and write user callback handle
 ! 250: **   -----------------------------------
 ! 251: **
 ! 252: **  The callbacks from the SGML parser have an SGML context parameter.
 ! 253: **  These calls allow the caller to associate his own context with a
 ! 254: **  particular SGML context.
 ! 255: */
 ! 256: 
 ! 257: PUBLIC void* SGML_callerData ARGS1(HTSGMLContext, context)
 ! 258: {
 ! 259:   return context->callerData;
 ! 260: }
 ! 261: 
 ! 262: PUBLIC void SGML_setCallerData ARGS2(HTSGMLContext, context, void*, data)
 ! 263: {
 ! 264:   context->callerData = data;
 ! 265: }
 ! 266: 
 ! 267: 
 ! 268: PUBLIC void SGML_string ARGS2(HTSGMLContext, context, char*, str)
 ! 269: {
 ! 270:   char *p;
 ! 271:   for(p=str; *p; p++)
 ! 272:     SGML_character(context, *p);
 ! 273: }
 ! 274: 
 ! 275: PUBLIC void SGML_character ARGS2(HTSGMLContext, context, char,c)
 ! 276: 
 ! 277: {
 ! 278:   SGML_dtd  *dtd  =    context->dtd;
 ! 279:   HTChunk  *string =    context->string;
 ! 280: 
 ! 281:   switch(context->state) {
 ! 282:   case S_text:
 ! 283:    if (c=='&' && !(context->element_stack &&
 ! 284:            context->element_stack->tag &&
 ! 285:            context->element_stack->tag->litteral)) {
 ! 286:      string->size = 0;
 ! 287:      context->state = S_ero;
 ! 288:      
 ! 289:    } else if (c=='<') {
 ! 290:      string->size = 0;
 ! 291:      context->state = (context->element_stack &&
 ! 292:            context->element_stack->tag &&
 ! 293:            context->element_stack->tag->litteral) ?
 ! 294:                S_litteral : S_tag;
 ! 295:    } else (*context->contents_treatment)(context->callerData, c);
 ! 296:    break;
 ! 297: 
 ! 298: /*   In litteral mode, waits only for specific end tag!
 ! 299: */
 ! 300:   case S_litteral :
 ! 301:    HTChunkPutc(string, c);
 ! 302:    if ( TOUPPER(c) != ((string->size ==1) ? '/'
 ! 303:        : context->element_stack->tag->name[string->size-2])) {
 ! 304:      int i;
 ! 305:      
 ! 306:      /* If complete match, end litteral */
 ! 307:      if ((c=='>') && (!context->element_stack->tag->name[string->size-2])) {
 ! 308:        end_element(context, context->element_stack->tag);
 ! 309:        string->size = 0;
 ! 310:        context->current_attribute = (attr *) 0;
 ! 311:        context->state = S_text;
 ! 312:        break;
 ! 313:      }      /* If Mismatch: recover string. */
 ! 314:      (*context->contents_treatment)(context->callerData, '<');
 ! 315:      for (i=0; i<string->size; i++)   /* recover */
 ! 316:        (*context->contents_treatment)(context->callerData,
 ! 317:                       string->data[i]);
 ! 318:      context->state = S_text;  
 ! 319:    }
 ! 320:    
 ! 321:     break;
 ! 322: 
 ! 323: /*   Character reference or Entity
 ! 324: */
 ! 325:  case S_ero:
 ! 326:    if (c=='#') {
 ! 327:      context->state = S_cro; /*  &# is Char Ref Open */ 
 ! 328:      break;
 ! 329:    }
 ! 330:    context->state = S_entity;  /* Fall through! */
 ! 331:    
 ! 332: /*   Handle Entities
 ! 333: */
 ! 334:   case S_entity:
 ! 335:    if (isalnum(c))
 ! 336:      HTChunkPutc(string, c);
 ! 337:    else {
 ! 338:      HTChunkTerminate(string);
 ! 339:      handle_entity(context, c);
 ! 340:      context->state = S_text;
 ! 341:    }
 ! 342:    break;
 ! 343: 
 ! 344: /*   Character reference
 ! 345: */
 ! 346:   case S_cro:
 ! 347:    if (isalnum(c))
 ! 348:      HTChunkPutc(string, c);   /* accumulate a character NUMBER */
 ! 349:    else {
 ! 350:      int value;
 ! 351:      HTChunkTerminate(string);
 ! 352:      if (sscanf(string->data, "%d", &value)==1)
 ! 353:        (*context->contents_treatment)(context->callerData,
 ! 354:                        FROMASCII((char)value));
 ! 355:      context->state = S_text;
 ! 356:    }
 ! 357:    break;
 ! 358: 
 ! 359: /*       Tag
 ! 360: */     
 ! 361:   case S_tag:                /* new tag */
 ! 362:    if (isalnum(c))
 ! 363:      HTChunkPutc(string, c);
 ! 364:    else {             /* End of tag name */
 ! 365:      attr * a;
 ! 366:      if (c=='/') {
 ! 367:        if (TRACE) if (string->size!=0)
 ! 368:          fprintf(stderr,"SGML: `<%s/' found!\n", string->data);
 ! 369:        context->state = S_end;
 ! 370:        break;
 ! 371:      }
 ! 372:      HTChunkTerminate(string) ;
 ! 373:      for(context->current_tag = dtd->tags;
 ! 374:        context->current_tag->name; context->current_tag++) {
 ! 375:        if (0==strcasecomp(context->current_tag->name, string->data)) {
 ! 376:          break;
 ! 377:        }
 ! 378:      }
 ! 379:      if (!context->current_tag->name) {
 ! 380:        if(TRACE) fprintf(stderr, "Unknown tag %s\n",
 ! 381:            string->data);
 ! 382:        context->state = (c=='>') ? S_text : S_junk_tag;
 ! 383:        break;
 ! 384:      }
 ! 385:      
 ! 386:      for (a = context->current_tag->attributes; a->name; a++ ) {
 ! 387:        a->present = NO;
 ! 388:      }
 ! 389:      string->size = 0;
 ! 390:      context->current_attribute = (attr *) 0;
 ! 391:      
 ! 392:      if (c=='>') {
 ! 393:        if (context->current_tag->name) start_element(context);
 ! 394:        context->state = S_text;
 ! 395:      } else {
 ! 396:        context->state = S_tag_gap;
 ! 397:      }
 ! 398:    }
 ! 399:    break;
 ! 400: 
 ! 401:        
 ! 402:   case S_tag_gap:      /* Expecting attribute or > */
 ! 403:    if (WHITE(c)) break;  /* Gap between attributes */
 ! 404:    if (c=='>') {      /* End of tag */
 ! 405:      if (context->current_tag->name) start_element(context);
 ! 406:      context->state = S_text;
 ! 407:      break;
 ! 408:    }
 ! 409:    HTChunkPutc(string, c);
 ! 410:    context->state = S_attr;        /* Get attribute */
 ! 411:    break;
 ! 412:    
 ! 413:                /* accumulating value */
 ! 414:   case S_attr:
 ! 415:    if (WHITE(c) || (c=='>') || (c=='=')) {     /* End of word */
 ! 416:      HTChunkTerminate(string) ;
 ! 417:      handle_attribute_name(context, string->data);
 ! 418:      string->size = 0;
 ! 419:      if (c=='>') {        /* End of tag */
 ! 420:        if (context->current_tag->name) start_element(context);
 ! 421:        context->state = S_text;
 ! 422:        break;
 ! 423:      }
 ! 424:      context->state = (c=='=' ? S_equals: S_attr_gap);
 ! 425:    } else {
 ! 426:      HTChunkPutc(string, c);
 ! 427:    }
 ! 428:    break;
 ! 429:        
 ! 430:   case S_attr_gap:      /* Expecting attribute or = or > */
 ! 431:    if (WHITE(c)) break;  /* Gap after attribute */
 ! 432:    if (c=='>') {      /* End of tag */
 ! 433:      if (context->current_tag->name) start_element(context);
 ! 434:      context->state = S_text;
 ! 435:      break;
 ! 436:    } else if (c=='=') {
 ! 437:      context->state = S_equals;
 ! 438:      break;
 ! 439:    }
 ! 440:    HTChunkPutc(string, c);
 ! 441:    context->state = S_attr;        /* Get next attribute */
 ! 442:    break;
 ! 443:    
 ! 444:   case S_equals:           /* After attr = */ 
 ! 445:    if (WHITE(c)) break;  /* Before attribute value */
 ! 446:    if (c=='>') {      /* End of tag */
 ! 447:      fprintf(stderr, "SGML: found = but no value\n");
 ! 448:      if (context->current_tag->name) start_element(context);
 ! 449:      context->state = S_text;
 ! 450:      break;
 ! 451:      
 ! 452:    } else if (c=='\'') {
 ! 453:      context->state = S_squoted;
 ! 454:      break;
 ! 455: 
 ! 456:    } else if (c=='"') {
 ! 457:      context->state = S_dquoted;
 ! 458:      break;
 ! 459:    }
 ! 460:    HTChunkPutc(string, c);
 ! 461:    context->state = S_value;
 ! 462:    break;
 ! 463:    
 ! 464:   case S_value:
 ! 465:    if (WHITE(c) || (c=='>')) {       /* End of word */
 ! 466:      HTChunkTerminate(string) ;
 ! 467:      handle_attribute_value(context, string->data);
 ! 468:      string->size = 0;
 ! 469:      if (c=='>') {        /* End of tag */
 ! 470:        if (context->current_tag->name) start_element(context);
 ! 471:        context->state = S_text;
 ! 472:        break;
 ! 473:      }
 ! 474:      else context->state = S_tag_gap;
 ! 475:    } else {
 ! 476:      HTChunkPutc(string, c);
 ! 477:    }
 ! 478:    break;
 ! 479:        
 ! 480:   case S_squoted:      /* Quoted attribute value */
 ! 481:    if (c=='\'') {     /* End of attribute value */
 ! 482:      HTChunkTerminate(string) ;
 ! 483:      handle_attribute_value(context, string->data);
 ! 484:      string->size = 0;
 ! 485:      context->state = S_tag_gap;
 ! 486:    } else {
 ! 487:      HTChunkPutc(string, c);
 ! 488:    }
 ! 489:    break;
 ! 490:    
 ! 491:   case S_dquoted:      /* Quoted attribute value */
 ! 492:    if (c=='"') {      /* End of attribute value */
 ! 493:      HTChunkTerminate(string) ;
 ! 494:      handle_attribute_value(context, string->data);
 ! 495:      string->size = 0;
 ! 496:      context->state = S_tag_gap;
 ! 497:    } else {
 ! 498:      HTChunkPutc(string, c);
 ! 499:    }
 ! 500:    break;
 ! 501:    
 ! 502:   case S_end:                    /* </ */
 ! 503:    if (isalnum(c))
 ! 504:      HTChunkPutc(string, c);
 ! 505:    else {             /* End of end tag name */
 ! 506:      HTChunkTerminate(string) ;
 ! 507:      if (c!='>') {
 ! 508:        if (TRACE) fprintf(stderr,"SGML: `</%s%c' found!\n",
 ! 509:          string->data, c);
 ! 510:        context->state = S_junk_tag;
 ! 511:        break;
 ! 512:      }
 ! 513:      for(context->current_tag = dtd->tags;
 ! 514:        context->current_tag->name; context->current_tag++) {
 ! 515:        if (0==strcasecomp(context->current_tag->name, string->data)) {
 ! 516:          end_element( context, context->current_tag);
 ! 517:          break;
 ! 518:        }
 ! 519:      }
 ! 520:      if (!context->current_tag->name) {
 ! 521:        if(TRACE) fprintf(stderr,
 ! 522:          "Unknown end tag </%s>\n", string->data); 
 ! 523:      }
 ! 524:      string->size = 0;
 ! 525:      context->current_attribute = (attr *) 0;
 ! 526:      context->state = S_text;
 ! 527:    }
 ! 528:    break;
 ! 529: 
 ! 530:        
 ! 531:   case S_junk_tag:
 ! 532:    if (c=='>') {
 ! 533:      context->state = S_text;
 ! 534:    }
 ! 535:    
 ! 536:   } /* switch on context->state */
 ! 537: 
 ! 538: } /* SGML_character */

Webmaster

AltStyle によって変換されたページ (->オリジナル) /