[BACK] Return to HTML.c CVS log [TXT] [DIR] Up to [Public] / libwww / Library / src

Annotation of libwww/Library/src/HTML.c, revision 1.1.1.1

1.1 timbl 1: /*       HTML Parser
 2: **       ===========
 3: **
 4: ** An HTML displayable object has associated with it
 5: **
 6: **       - The underlying text object for display
 7: **       - An SGML parsing context
 8: **       - An anchor representing the whole object
 9: **       - A style sheet, in the case of a style-oriented version
 10: **
 11: ** The first three could logically be represented by multiple inheritance if
 12: ** that were supported, as an HTML object is like a subclass of all three.
 13: **
 14: ** In practice in C,
 15: **
 16: **   - a HText object is created by this module (when needed)
 17: **   - an SGML parsing object is created by this module
 18: **   - the anchor representing the object is given at creation time
 19: **
 20: ** Those using structured HTML objects will wish to override this module
 21: ** completely
 22: */
 23: #include "HTML.h"
 24: 
 25: #include <ctype.h>
 26: #include <stdio.h>
 27: 
 28: #include "HTAtom.h"
 29: #include "HTChunk.h"
 30: #include "HText.h"
 31: #include "HTStyle.h"
 32: 
 33: 
 34: /*               SPECIAL HTML CODE
 35: **               =================
 36: */
 37: 
 38: extern HTStyleSheet * styleSheet;   /* Application-wide */
 39: 
 40: /*   Module-wide style cache
 41: */
 42: PRIVATE HTStyle * glossary_style;
 43: PRIVATE HTStyle * list_compact_style;
 44: PRIVATE HTStyle * glossary_compact_style;
 45: PRIVATE int      got_styles = 0;
 46: 
 47: 
 48: /*       HTML Object
 49: **       -----------
 50: */
 51: struct _HTML {
 52:   HTParentAnchor *  node_anchor;
 53:   HText *      text;
 54:   HTSGMLContext   context;
 55: 
 56:   HTChunk title;   /* Grow by 128 */
 57: 
 58: /* Used in parsing: */
 59: 
 60:   BOOL    style_change;
 61:   HTStyle * new_style;
 62:   HTStyle * old_style;
 63:   BOOL    in_word; /* Have just had a non-white character */
 64: };
 65: 
 66: 
 67: /*       Forward declarations of routines
 68: */
 69: PRIVATE void get_styles NOPARAMS;
 70: 
 71: /* For dtd: */
 72: PRIVATE void no_change PARAMS((void*this, HTTag * t, HTElement * e));
 73: PRIVATE void begin_litteral PARAMS((void*this, HTTag * t, HTElement * e));
 74: PRIVATE void begin_element PARAMS((void*this, HTTag * t, HTElement * e));
 75: PRIVATE void end_element PARAMS((void*this, HTTag * t, HTElement * e));
 76: PRIVATE void begin_document PARAMS((void*this, HTTag * t, HTElement * e));
 77: PRIVATE void end_document PARAMS((void*this, HTTag * t, HTElement * e));
 78: PRIVATE void begin_anchor PARAMS((void*this, HTTag * t, HTElement * e));
 79: PRIVATE void end_anchor PARAMS((void*this, HTTag * t, HTElement * e));
 80: PRIVATE void begin_list PARAMS((void*this, HTTag * t, HTElement * e));
 81: PRIVATE void list_element PARAMS((void*this, HTTag * t, HTElement * e));
 82: PRIVATE void end_list PARAMS((void*this, HTTag * t, HTElement * e));
 83: PRIVATE void begin_glossary PARAMS((void*this, HTTag * t, HTElement * e));
 84: PRIVATE void end_glossary PARAMS((void*this, HTTag * t, HTElement * e));
 85: 
 86: PRIVATE void actually_set_style PARAMS((HTML_id this));
 87: PRIVATE void change_style PARAMS((HTML_id this, HTStyle * style));
 88: 
 89: /*   Style buffering avoids dummy paragraph begin/ends.
 90: */
 91: #define UPDATE_STYLE if (THIS->style_change) { actually_set_style(THIS); }
 92: 
 93: #define THIS ((HTML_id)this)
 94: 
 95: /*   Things affecting the anchor but not the document itself
 96: **   -------------------------------------------------------
 97: */
 98: 
 99: 
 100: /*       TITLE
 101: */
 102: 
 103: /*   Accumulate a character of title
 104: */
 105: static void accumulate_string ARGS2(void *, this, char, c)
 106: 
 107: {
 108:   HTChunkPutc(&THIS->title, c);
 109: }
 110: 
 111: 
 112: /*       Clear the title
 113: */
 114: PRIVATE void clear_string ARGS3(void *, this, HTTag *,t, HTElement *,e)
 115: {
 116:   HTChunkClear(&THIS->title);
 117: }
 118: 
 119: PRIVATE void set_title ARGS3(void *, this, HTTag *,t, HTElement *,e)
 120: {
 121:   HTChunkTerminate(&THIS->title);
 122:   HTAnchor_setTitle(THIS->node_anchor, THIS->title.data);
 123: }
 124: 
 125: PRIVATE void set_index ARGS3(void *, this, HTTag *,t, HTElement *,e)
 126: {
 127:   HTAnchor_setIndex(THIS->node_anchor);
 128: }
 129: 
 130: /*           Things affecting the document
 131: **           -----------------------------
 132: */
 133: /*       Character handling
 134: */
 135: PRIVATE void pass_character ARGS2(void *, this, char, c)
 136: {
 137:   if (THIS->style_change) {
 138:     if ((c=='\n') || (c==' ')) return;   /* Ignore it */
 139:     UPDATE_STYLE;
 140:   }
 141:   if (c=='\n') {
 142:     if (THIS->in_word) {
 143:      HText_appendCharacter(THIS->text, ' ');
 144:      THIS->in_word = NO;
 145:    }
 146:   } else {
 147:     HText_appendCharacter(THIS->text, c);
 148:    THIS->in_word = YES;
 149:   }
 150: }
 151: 
 152: PRIVATE void litteral_text ARGS2(void *, this, char, c)
 153: {
 154: /*   We guarrantee that the style is up-to-date in begin_litteral
 155: */
 156:   HText_appendCharacter(THIS->text, c);       /* @@@@@ */
 157: }
 158: 
 159: PRIVATE void ignore_text ARGS2(void *, this, char, c)
 160: {
 161:   /* Do nothing */
 162: }
 163: 
 164: PRIVATE void set_next_id ARGS3(void *, this, HTTag *,t, HTElement *,e)
 165: {
 166:   /* Not needed */
 167: }
 168: 
 169: PRIVATE void new_paragraph ARGS3(void *, this, HTTag *,t, HTElement *,e)
 170: {
 171:   UPDATE_STYLE;
 172:   HText_appendParagraph(THIS->text);
 173:   THIS->in_word = NO;
 174: }
 175: 
 176: PRIVATE void term ARGS3(void *, this, HTTag *,t, HTElement *,e)
 177: {
 178:   if (!THIS->style_change) {
 179:     HText_appendParagraph(THIS->text);
 180:    THIS->in_word = NO;
 181:   }
 182: }
 183: 
 184: PRIVATE void definition ARGS3(void *, this, HTTag *,t, HTElement *,e)
 185: {
 186:   UPDATE_STYLE;
 187:   pass_character(this, '\t');    /* Just tab out one stop */
 188:   THIS->in_word = NO;
 189: }
 190: 
 191: /*       Our Static DTD for HTML
 192: **       -----------------------
 193: */
 194: 
 195: static entity entities[] = {
 196:    { "lt", "<" },
 197:    { "gt", ">" },
 198:    { "amp", "&" },
 199: #ifdef NeXT
 200:    { "bullet" , "267円" },         /* @@@ NeXT only */
 201: #endif
 202: /* The following accented characters are from peter Flynn, curia project */
 203: 
 204: /* these ifdefs don't solve the problem of a simple terminal emulator
 205: ** with a different character set to the client machine. But nothing does,
 206: ** except looking at the TERM setting */
 207: 
 208:     { "ocus" , "&" },    /* for CURIA */
 209: #ifdef IBMPC
 210:     { "aacute" , "240円" }, /* For PC display */
 211:     { "eacute" , "202円" },
 212:     { "iacute" , "241円" },
 213:     { "oacute" , "242円" },
 214:     { "uacute" , "243円" },
 215:     { "Aacute" , "101円" },
 216:     { "Eacute" , "220円" },
 217:     { "Iacute" , "111円" },
 218:     { "Oacute" , "117円" },
 219:     { "Uacute" , "125円" },
 220: #else
 221:     { "aacute" , "341円" }, /* Works for openwindows -- Peter Flynn */
 222:     { "eacute" , "351円" },
 223:     { "iacute" , "355円" },
 224:     { "oacute" , "363円" },
 225:     { "uacute" , "372円" },
 226:     { "Aacute" , "301円" },
 227:     { "Eacute" , "310円" },
 228:     { "Iacute" , "315円" },
 229:     { "Oacute" , "323円" },
 230:     { "Uacute" , "332円" }, 
 231: #endif
 232:    { 0,  0 } /* Terminate list */
 233: };
 234: 
 235: static attr no_attr[] = {{ 0, 0 , 0}};
 236: 
 237: static attr a_attr[] = {                /* Anchor attributes */
 238: #define A_ID 0
 239:    { "NAME", 0, 0 },                /* Should be ID */
 240: #define A_TYPE 1
 241:    { "TYPE", 0, 0 },
 242: #define A_HREF 2
 243:    { "HREF", 0, 0 },
 244:    { 0, 0 , 0}   /* Terminate list */
 245: };   
 246: static attr list_attr[] = {
 247: #define LIST_COMPACT 0
 248:    { "COMPACT", 0, 0 },
 249:    { 0, 0, 0 }   /* Terminate list */
 250: };
 251: 
 252: static attr glossary_attr[] = {
 253: #define GLOSSARY_COMPACT 0
 254:    { "COMPACT", 0, 0 },
 255:    { 0, 0, 0 }   /* Terminate list */
 256: };
 257: 
 258: static HTTag default_tag =
 259:   { "DOCUMENT", no_attr , 0, 0, begin_document, pass_character, end_document };
 260: /*   NAME ATTR STYLE LITERAL? ON_BEGIN  ON__CHARACTER   ON_END
 261: */
 262: static HTTag tags[] = {
 263: #define TITLE_TAG 0
 264:   { "TITLE", no_attr, 0, 0, clear_string, accumulate_string, set_title },
 265: #define ISINDEX_TAG 1
 266:   { "ISINDEX", no_attr, 0, 0, set_index, 0 , 0 },
 267: #define NEXTID_TAG 2
 268:   { "NEXTID", no_attr, 0, 0, set_next_id, 0, 0 },
 269: #define ADDRESS_TAG 3
 270:   { "ADDRESS"    , no_attr, 0, 0, begin_element, pass_character, end_element },
 271: #define H1_TAG 4
 272:   { "H1"   , no_attr, 0, 0, begin_element, pass_character, end_element },
 273:   { "H2"   , no_attr, 0, 0, begin_element, pass_character, end_element },
 274:   { "H3"   , no_attr, 0, 0, begin_element, pass_character, end_element },
 275:   { "H4"   , no_attr, 0, 0, begin_element, pass_character, end_element },
 276:   { "H5"   , no_attr, 0, 0, begin_element, pass_character, end_element },
 277:   { "H6"   , no_attr, 0, 0, begin_element, pass_character, end_element },
 278:   { "H7"   , no_attr, 0, 0, begin_element, pass_character, end_element },
 279: #define UL_TAG 11
 280:   { "UL"   , list_attr, 0, 0, begin_list, pass_character, end_list },
 281: #define OL_TAG 12
 282:   { "OL"   , list_attr, 0, 0, begin_list, pass_character, end_list },
 283: #define MENU_TAG 13
 284:   { "MENU"  , list_attr, 0, 0, begin_list, pass_character, end_list },
 285: #define DIR_TAG 14
 286:   { "DIR"  , list_attr, 0, 0, begin_list, pass_character, end_list },
 287: #define LI_TAG 15
 288:   { "LI"   , list_attr, 0, 0, list_element, pass_character, 0 },
 289: #define DL_TAG 16
 290:   { "DL"   , glossary_attr, 0, 0, begin_glossary, pass_character, end_glossary },
 291:   { "DT"   , no_attr, 0, 0, term, pass_character, 0 },
 292:   { "DD"   , no_attr, 0, 0, definition, pass_character, 0 },
 293:   { "A"   , a_attr, 0, 0, begin_anchor, pass_character, end_anchor },
 294: #define P_TAG 20
 295:   { "P"   , no_attr, 0, 0, new_paragraph, pass_character, 0 },
 296: #define XMP_TAG 21
 297:  { "XMP"   , no_attr, 0, YES, begin_litteral, litteral_text, end_element },
 298: #define PRE_TAG 22
 299:  { "PRE"   , no_attr, 0, 0, begin_litteral, litteral_text, end_element },
 300: #define LISTING_TAG 23
 301:  { "LISTING" , no_attr, 0, YES,begin_litteral, litteral_text, end_element },
 302: #define PLAINTEXT_TAG 24
 303:  { "PLAINTEXT", no_attr, 0, YES, begin_litteral, litteral_text, end_element },
 304: #define COMMENT_TAG 25
 305:   { "COMMENT", no_attr, 0, YES, no_change, ignore_text, no_change },
 306:   { 0, 0, 0, 0, 0, 0 , 0}  /* Terminate list */
 307: };
 308: 
 309: PUBLIC SGML_dtd HTML_dtd = { tags, &default_tag, entities };
 310: 
 311: 
 312: /*       Flattening the style structure
 313: **       ------------------------------
 314: **
 315: On the NeXT, and on any read-only browser, it is simpler for the text to have
 316: a sequence of styles, rather than a nested tree of styles. In this
 317: case we have to flatten the structure as it arrives from SGML tags into
 318: a sequence of styles.
 319: */
 320: 
 321: /*       If style really needs to be set, call this
 322: */
 323: PRIVATE void actually_set_style ARGS1(HTML_id, this)
 324: {
 325:   if (!THIS->text) {         /* First time through */
 326:      THIS->text = HText_new(THIS->node_anchor);
 327:      HText_beginAppend(THIS->text);
 328:      HText_setStyle(THIS->text, THIS->new_style);
 329:      THIS->in_word = NO;
 330:   } else {
 331:      HText_setStyle(THIS->text, THIS->new_style);
 332:   }
 333:   THIS->old_style = THIS->new_style;
 334:   THIS->style_change = NO;
 335: }
 336: 
 337: /*   If you THINK you need to change style, call this
 338: */
 339: 
 340: PRIVATE void change_style ARGS2(HTML_id, this, HTStyle *,style)
 341: {
 342:   if (THIS->new_style!=style) {
 343:    THIS->style_change = YES /* was old_style == new_style */ ;
 344:    THIS->new_style = style;
 345:   }
 346: }
 347: 
 348: /*   Anchor handling
 349: **   ---------------
 350: */
 351: PRIVATE void begin_anchor ARGS3(void *, this, HTTag *,t, HTElement *,e)
 352: {
 353:   HTChildAnchor * source = HTAnchor_findChildAndLink(
 354:    THIS->node_anchor,                       /* parent */
 355:    a_attr[A_ID].present  ? a_attr[A_ID].value : 0,    /* Tag */
 356:    a_attr[A_HREF].present ? a_attr[A_HREF].value : 0,   /* Addresss */
 357:    a_attr[A_TYPE].present ? 
 358:        (HTLinkType*)HTAtom_for(a_attr[A_TYPE].value)
 359:         : 0);
 360:   
 361:   UPDATE_STYLE;
 362:   HText_beginAnchor(THIS->text, source);
 363: }
 364: 
 365: PRIVATE void end_anchor ARGS3(void *, this, HTTag *,  t,
 366:            HTElement *,  e)
 367: {
 368:   UPDATE_STYLE;
 369:   HText_endAnchor(THIS->text);
 370: }
 371: 
 372: 
 373: /*   General SGML Element Handling
 374: **   -----------------------------
 375: */
 376: PRIVATE void begin_element ARGS3(void *, this, HTTag *,t, HTElement *,e)
 377: {
 378:   change_style(THIS, (HTStyle*)(t->style));
 379: }
 380: PRIVATE void no_change ARGS3(void *, this, HTTag *,t, HTElement *,e)
 381: {
 382:   /* Do nothing */;
 383: }
 384: PRIVATE void begin_litteral ARGS3(void *, this, HTTag *,t, HTElement *,e)
 385: {
 386:   change_style(THIS, t->style);
 387:   UPDATE_STYLE;
 388: }
 389: /*       End Element
 390: **
 391: **   When we end an element, the style must be returned to that
 392: **   in effect before that element. Note that anchors (etc?)
 393: **   don't have an associated style, so that we must scan down the
 394: **   stack for an element with a defined style. (In fact, the styles
 395: **   should be linked to the whole stack not just the top one.)
 396: **   TBL 921119
 397: */
 398: PRIVATE void end_element ARGS3(void *, this, HTTag *,t, HTElement *,e)
 399: {
 400: /*  if (e) change_style(THIS, e->tag->style); */
 401:   while (e) {
 402:    if (e->tag->style) {
 403:        change_style(THIS, e->tag->style);
 404:        return;
 405:    }
 406:    e = e->next;
 407:   }
 408: }
 409: 
 410: /*           Lists
 411: */
 412: PRIVATE void begin_list ARGS3(void *, this, HTTag *,t, HTElement *,e)
 413: {
 414:   change_style(THIS, list_attr[LIST_COMPACT].present
 415:        ? list_compact_style
 416:        : (HTStyle*)(t->style));
 417:   THIS->in_word = NO;
 418: }
 419: 
 420: PRIVATE void end_list ARGS3(void *, this, HTTag *,t, HTElement *,e)
 421: {
 422:   change_style(THIS, e->tag->style);
 423:   THIS->in_word = NO;
 424: }
 425: 
 426: PRIVATE void list_element ARGS3(void *, this, HTTag *,t, HTElement *,e)
 427: {
 428:   UPDATE_STYLE;
 429:   if (e->tag != &tags[DIR_TAG])
 430:    HText_appendParagraph(THIS->text);
 431:   else
 432:     HText_appendCharacter(THIS->text, '\t');    /* Tab @@ nl for UL? */
 433:   THIS->in_word = NO;
 434: }
 435: 
 436: 
 437: PRIVATE void begin_glossary ARGS3(void *, this, HTTag *,t, HTElement *,e)
 438: {
 439:   change_style(THIS, glossary_attr[GLOSSARY_COMPACT].present
 440:        ? glossary_compact_style
 441:        : glossary_style);
 442:   THIS->in_word = NO;
 443: }
 444: 
 445: PRIVATE void end_glossary ARGS3(void *, this, HTTag *,t, HTElement *,e)
 446: {
 447:   change_style(THIS, e->tag->style);
 448:   THIS->in_word = NO;
 449: }
 450: 
 451: 
 452: /*   Create an HTML object
 453: **   ---------------------
 454: */
 455: PUBLIC HTML_id HTML_new ARGS1(HTParentAnchor *,anchor)
 456: {
 457: 
 458:   HTML_id this = malloc(sizeof(*this));
 459: 
 460:   if (!got_styles) get_styles();
 461: 
 462:   this->node_anchor = anchor;
 463:   this->title.size = 0;
 464:   this->title.growby = 128;
 465:   this->title.allocated = 0;
 466:   this->title.data = 0;
 467:   this->text = 0;
 468:   this->style_change = YES; /* Force check leading to text creation */
 469:   this->new_style = this->old_style = 0;
 470:   
 471:   this->context = SGML_begin(&HTML_dtd);
 472:   SGML_setCallerData(this->context, this);
 473:   
 474:   return this;
 475: }
 476: 
 477: 
 478: /*   Free an HTML object
 479: **   -------------------
 480: **
 481: **   Note that the SGML parsing context is freed, but the created object is not,
 482: **   as it takes on an existence of its own unless explicitly freed.
 483: */
 484: PUBLIC void HTML_free ARGS1(HTML_id, this)
 485: {
 486:   SGML_end(this->context);
 487:   free(this);
 488: }
 489: 
 490: PUBLIC HTSGMLContext HTML_SGMLContext ARGS1(HTML_id, this)
 491: {
 492:   return this->context;
 493: }
 494: 
 495: PRIVATE void begin_document ARGS3(void *, this, HTTag *, t, HTElement *, e)
 496: {
 497:   /* Can't do much, THIS is undefined here */
 498: }
 499: 
 500: PRIVATE void end_document ARGS3(void *, this, HTTag *, t, HTElement *, e)
 501: /* If the document is empty, the text object will not yet exist.
 502:  So we could in fact abandon creating the document and return
 503:  an error code. In fact an empty document is an important type
 504:  of document, so we don't.
 505: */
 506: {
 507:   UPDATE_STYLE;       /* Create empty document here! */
 508:   HText_endAppend(THIS->text);
 509: 
 510: }
 511: 
 512: /*   Get Styles from style sheet
 513: **   ---------------------------
 514: */
 515: PRIVATE void get_styles NOARGS
 516: {
 517:   got_styles = YES;
 518:   
 519:   tags[P_TAG].style =
 520:   default_tag.style =        HTStyleNamed(styleSheet, "Normal");
 521:   tags[H1_TAG].style =    HTStyleNamed(styleSheet, "Heading1");
 522:   tags[H1_TAG+1].style =   HTStyleNamed(styleSheet, "Heading2");
 523:   tags[H1_TAG+2].style =   HTStyleNamed(styleSheet, "Heading3");
 524:   tags[H1_TAG+3].style =   HTStyleNamed(styleSheet, "Heading4");
 525:   tags[H1_TAG+4].style =   HTStyleNamed(styleSheet, "Heading5");
 526:   tags[H1_TAG+5].style =   HTStyleNamed(styleSheet, "Heading6");
 527:   tags[H1_TAG+6].style =   HTStyleNamed(styleSheet, "Heading7");
 528:   tags[DL_TAG].style =    HTStyleNamed(styleSheet, "Glossary");
 529:   tags[UL_TAG].style =    HTStyleNamed(styleSheet, "List");
 530:   tags[OL_TAG].style =    HTStyleNamed(styleSheet, "List");
 531:   tags[MENU_TAG].style =   HTStyleNamed(styleSheet, "Menu");
 532:   list_compact_style =
 533:   tags[DIR_TAG].style =   HTStyleNamed(styleSheet, "Dir");  
 534:   glossary_style =      HTStyleNamed(styleSheet, "Glossary");
 535:   glossary_compact_style =  HTStyleNamed(styleSheet, "GlossaryCompact");
 536:   tags[ADDRESS_TAG].style=  HTStyleNamed(styleSheet, "Address");
 537:   tags[PLAINTEXT_TAG].style =
 538:   tags[XMP_TAG].style =   HTStyleNamed(styleSheet, "Example");
 539:   tags[PRE_TAG].style =   HTStyleNamed(styleSheet, "Preformatted");
 540:   tags[LISTING_TAG].style = HTStyleNamed(styleSheet, "Listing");
 541: }
 542: 
 543: 
 544: /*   Parse an HTML file
 545: **   ------------------
 546: **
 547: **   This version takes a pointer to the routine to call
 548: **   to get each character.
 549: */
 550: BOOL HTML_Parse
 551: #ifdef __STDC__
 552:  (HTParentAnchor * anchor, char (*next_char)() )
 553: #else
 554:  (anchor, next_char)
 555:   HTParentAnchor * anchor;
 556:   char (*next_char)();
 557: #endif
 558: {
 559:    HTSGMLContext context;
 560:     HTML_id this = HTML_new(anchor);
 561:    context = SGML_begin(&HTML_dtd);
 562:    SGML_setCallerData(context, this);
 563:    for(;;) {
 564:      char character;
 565:      character = (*next_char)();
 566:      if (character == (char)EOF) break;
 567:   
 568:      SGML_character(context, character);      
 569:     }
 570:    SGML_end(context);
 571:    free(this);
 572:    return YES;
 573: }

Webmaster

AltStyle によって変換されたページ (->オリジナル) /