# This is a shell archive.  Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# Wrapped by imagine!connolly on Mon Nov 23 07:58:04 CST 1992
# Contents:  Makefile SGMLmain.c SGMLstream.c HTMLentities.c test.html
#	SGMLstream.h c_dialect.h HTML_SGML_decl.h HTMLentities.h
#
# Each file is written through sed 's/^@//', so a content line that
# must start with '@' has to be stored with one extra leading '@'.

echo x - Makefile
sed 's/^@//' > "Makefile" <<'@//E*O*F Makefile//'
# Makefile for HTML library
#
# Default target builds the library; "make sgml_test" builds the test
# driver and "make test" runs it over test.html.

RANLIB = ranlib
AR = ar
ARFLAGS = cq

HTMLLIB = libHTML.a
PROG = sgml_test

PROGOBJ = SGMLmain.o
HTMLLIBOBJ = SGMLstream.o HTMLentities.o

SRCS = SGMLmain.c \
	SGMLstream.c \
	HTMLentities.c

HDRS = SGMLstream.h c_dialect.h HTML_SGML_decl.h HTMLentities.h

DEBUG = -g
DEFINES = -DSGML_DECLARATION=\"HTML_SGML_decl.h\"
CFLAGS = $(DEBUG) $(DEFINES)

# The archive is removed first because "ar cq" only appends.
$(HTMLLIB): $(HTMLLIBOBJ)
	rm -f $(HTMLLIB)
	$(AR) $(ARFLAGS) $(HTMLLIB) $(HTMLLIBOBJ)
	$(RANLIB) $(HTMLLIB)

$(PROG): $(PROGOBJ) $(HTMLLIB)
	$(CC) -o $(PROG) $(CFLAGS) $(PROGOBJ) $(HTMLLIB)

TESTFILE = test.html

# These names are commands, not files.
.PHONY: test depend shar

# ./ so the driver is found even when "." is not in PATH.
test: $(PROG)
	./$(PROG) < $(TESTFILE)

depend:
	makedepend $(CFLAGS) $(SRCS)

shar: libHTML.shar

# Prerequisites listed so the archive is rebuilt when a member changes.
libHTML.shar: Makefile $(SRCS) test.html $(HDRS)
	shar Makefile $(SRCS) test.html $(HDRS) >$@

# DO NOT DELETE THIS LINE -- make depend depends on it.

# System headers are deliberately not listed: their absolute paths
# differ between hosts and a missing one stops the build.
SGMLmain.o: SGMLstream.h c_dialect.h HTML_SGML_decl.h HTMLentities.h
SGMLstream.o: SGMLstream.h c_dialect.h HTML_SGML_decl.h
HTMLentities.o: HTMLentities.h SGMLstream.h c_dialect.h HTML_SGML_decl.h
@//E*O*F Makefile//
chmod u=rw,g=rw,o=r Makefile

echo x - SGMLmain.c
sed 's/^@//' > "SGMLmain.c" <<'@//E*O*F SGMLmain.c//'
/* SGMLmain.c -- test driver for SGML io routines
 * $Id$
 *
 * Reads SGML/HTML from stdin and prints each tag (with its attributes)
 * and each run of character data, bracketed by __start_data__ and
 * __end_data__ lines, on stdout.
 */

#include "SGMLstream.h"
#include "HTMLentities.h"

#include <stdio.h>
#include <string.h>
#include <ctype.h>

int
main()
{
  int nread;                    /* result of SGML_read */
  char buffer[72];
  int content = SGML_PCDATA;    /* current parsing mode */
  int lookahead = EOF;          /* EOF means "nothing read ahead" */
  char name[SGML_NAMELEN+1];
  int name_chars;
  char value[SGML_LITLEN+1];
  int value_chars;

  while((nread = SGML_read((SGML_Object)stdin, (SGML_Method)getc,
			   buffer, sizeof(buffer),
			   (SGML_Object)HTML_entities, HTML_expand_entity,
			   sizeof(char),
			   content, &lookahead)) != EOF){
    switch(nread){
    case SGML_end_tag:
    case SGML_start_tag:
      name_chars = SGML_read_name((SGML_Object)stdin, (SGML_Method)getc,
				  name, &lookahead);
      name[name_chars] = '\0';

      if(nread == SGML_end_tag){
	printf("</%s>\n", name);
	content = SGML_PCDATA;
      }else{
	printf("<%s", name);

	/*
	 * certain tags change parsing mode
	 * @@ this should be table-driven
	 */
	if(!strcmp(name, "XMP") || !strcmp(name, "LISTING")){
	  content = SGML_RCDATA;
	}

	while(isalpha(lookahead)){ /* iterate over attributes */
	  name_chars = SGML_read_name((SGML_Object)stdin, (SGML_Method)getc,
				      name, &lookahead);
	  name[name_chars] = '\0';

	  if(lookahead == '='){
	    lookahead = EOF; /* consume the '=' */
	    value_chars = SGML_read_value((SGML_Object)stdin,
					  (SGML_Method)getc,
					  value,
					  (SGML_Object)HTML_entities,
					  HTML_expand_entity,
					  sizeof(char),
					  &lookahead);
	    /* SGML_read_value returns EOF (negative) when the input
	     * ends right after the '='; don't index with it. */
	    if(value_chars < 0) value_chars = 0;
	    value[value_chars] = '\0';
	    printf(" %s=\"%s\"", name, value); /* @@ */
	  }
	}
	printf(">\n");
      }

      /* look for tag close */
      while(lookahead != EOF){
	if(lookahead == '>') lookahead = EOF; /* eat tag close */
	else /* error: illegal char in markup */
	  lookahead = getc(stdin);
      }
      break;

    default:
      printf("__start_data__\n");
      fwrite(buffer, sizeof(char), nread, stdout); /* @@ */
      printf("__end_data__\n");
    }
  }
  return 0;
}
@//E*O*F SGMLmain.c//
chmod u=rw,g=rw,o=r SGMLmain.c

echo x - SGMLstream.c
sed 's/^@//' > "SGMLstream.c" <<'@//E*O*F SGMLstream.c//'
/* SGML_stream.c
 * $Id$
 */

/* implements... */
#include "SGMLstream.h"

/* uses ... */
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

/*
 * is_nmchar -- true if c is one of the extra name characters listed in
 * SGML_UCNMCHAR/SGML_LCNMCHAR.  EOF and NUL are rejected explicitly
 * because strchr() also matches the terminating NUL of its argument.
 */
static int
is_nmchar(c)
     int c;
{
  if(c == EOF || c == '\0') return 0;
  return strchr(SGML_UCNMCHAR SGML_LCNMCHAR, c) != 0;
}

/*
 * SGML_read -- read character data up to the next tag.
 *
 * Copies data from stream into buf (at most nbytes), expanding entity
 * references through expand_entity unless declared_content is
 * SGML_CDATA, and skipping comments, processing instructions and
 * markup declarations.  Returns the number of bytes stored,
 * SGML_start_tag/SGML_end_tag when a tag open was recognized, or EOF.
 * On every return except the EOF one, *inout_lookahead holds the last
 * character read from stream but not consumed.
 */
int
SGML_read(stream, getc, buf, nbytes,
	  entities, expand_entity, max_entity_length,
	  declared_content,
	  inout_lookahead)
     SGML_Object stream;
     SGML_Method getc;
     char* buf;
     int nbytes;
     int max_entity_length;
     SGML_Object entities;
     SGML_Method_charptrs expand_entity;
     int declared_content;
     int* inout_lookahead;
{
  int c; /* state machine input character */
  enum { /* state machine states */
    start, data,
    and, and_hash, entity,
    lt, lt_slash, tag,
    pi,
    lt_bang, lt_bang_dash, comment, comment_dash, ps
    } state = start;

  /* auxiliary state: */
  int cref = 0; /* saw '#' after '&' */
  int end_tag = 0; /* saw '/' after '<' */

  int ret = 0; /* number of characters read */
  char name[SGML_NAMELEN + 1]; /* entity name */
  int name_chars = 0;

/* LOOKAHEAD: is there room for n more bytes in buf? */
#define LOOKAHEAD(n) (ret + (n) < nbytes)
/* REDUCE: change state and consume c (leave the switch, read next) */
#define REDUCE(s) { state = (s); break; }
/* SHIFT: change state and look at the same c again */
#define SHIFT(s) { state = (s); continue; }
#define DONE(c) { *inout_lookahead = (c); return ret; }
#define WRITE(c) { *buf++ = (c); ret++; }

  /* check arguments */
  if(max_entity_length < 2) max_entity_length = 2;

  /* prime the pump */
  if((c = *inout_lookahead) == EOF)
    c = (getc)(stream);

  /* state machine...*/
  while(ret < nbytes){
    switch(state){
    case start:
      if(c == EOF) return EOF;
      else if(c == '<'){
	if(LOOKAHEAD(3)) { REDUCE(lt); }
	else { DONE(c); } /* no room for lookahead */
      }else { SHIFT(data); }

    case data:
      if(c == EOF || c == '<') { DONE(c); }
      else if(c == '&' && declared_content != SGML_CDATA){
	if(LOOKAHEAD(max_entity_length)) { REDUCE(and); }
	else { DONE(c); } /* no room to parse entity reference */
      }else{
	WRITE(c);
	break;
      }

    case and:
      if(c == '#') { REDUCE(and_hash); }
      else if(isalpha(c)) {
	cref = 0;
	name_chars = 0;
	SHIFT(entity);
      }
      else{
	WRITE('&');
	SHIFT(data);
      }

    case and_hash:
      if(isalnum(c)){
	name_chars = 0;
	cref = 1;
	SHIFT(entity);
      }
      else{
	WRITE('&'); WRITE('#');
	SHIFT(data);
      }

    case entity:
      if(isdigit(c)
	 || (cref == 0 && (isalpha(c) || is_nmchar(c)))){
	if(name_chars < SGML_NAMELEN) name[name_chars++] = c;
	/* else markup error: name too long */
	break;
      }
      else{
	int entlen;

	/* NOTE: cref is not passed on, so the digits of "&#nnn;" are
	 * looked up by expand_entity like any other entity name. */
	name[name_chars] = '\0';
	entlen = (expand_entity)(entities, name, buf);
	ret += entlen; buf += entlen;
	if(c == ';') { REDUCE(data); }
	else /* terminate entity reference w/space or something */
	  { SHIFT(data); }
      }

    case lt:
      if(c == '/') { REDUCE(lt_slash); }
      if(declared_content == SGML_PCDATA){
	if(c == '?') { REDUCE(pi); }
	else if(c == '!') { REDUCE(lt_bang); }
	else if(isalpha(c)) { end_tag = 0; SHIFT(tag); }
      }
      WRITE('<');
      SHIFT(data);

    case lt_slash:
      if(isalpha(c)) { end_tag = 1; SHIFT(tag); }
      else {
	WRITE('<'); WRITE('/');
	SHIFT(data);
      }

    case tag:
      ret = end_tag ? SGML_end_tag : SGML_start_tag;
      DONE(c);

    case pi: /* processing instruction (or markup declaration) */
      if(c == '>') { REDUCE(start); }
      else if(c == EOF) { SHIFT(start); } /* error: EOF in pic */
      else break;

    case lt_bang:
      if(c == '-') { REDUCE(lt_bang_dash); }
      /*
       * *** NON CONFORMING IMPLEMENTATION ***
       * a letter here starts a markup declaration, which isn't supported
       * a [ starts a marked section, which isn't supported.
       * treat them like processing instructions.
       */
      else if(c == '[' || isalpha(c)) { REDUCE(pi); }
      else{
	WRITE('<'); WRITE('!');
	SHIFT(data);
      }

    case lt_bang_dash:
      if(c == '-') { REDUCE(comment); }
      else{
	WRITE('<'); WRITE('!'); WRITE('-');
	SHIFT(data);
      }

    case comment:
      if(c == '-') { REDUCE(comment_dash); }
      else if(c == EOF) { DONE(c); } /* error: eof in comment */
      else break;

    case comment_dash:
      if(c == '-') { REDUCE(ps); }
      else if(c == EOF) { DONE(c); } /* error: eof in comment */
      /* a lone '-' is ordinary comment text: only two adjacent
       * dashes close the comment, so go back to scanning. */
      else { REDUCE(comment); }

    case ps: /* parameter separator between -- and > */
      if(isspace(c)) break;
      else { REDUCE(start); } /* error if c !='>' */
    }
    c = (getc)(stream);
  }
  DONE(c); /* set up lookahead for next call */

#undef LOOKAHEAD
#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}


/*
 * SGML_read_name -- read a name whose first character is in
 * *inout_lookahead.
 *
 * Stores at most SGML_NAMELEN characters, folded to upper case, at buf
 * (not NUL terminated; excess characters are read and dropped), skips
 * trailing whitespace, leaves the first character after that in
 * *inout_lookahead and returns the number of characters stored.
 * Returns 0 without reading if *inout_lookahead is not a letter.
 */
int
SGML_read_name(stream, getc, buf, inout_lookahead)
     SGML_Object stream;
     SGML_Method getc;
     char* buf;
     int* inout_lookahead;
{
  int name_chars = 0;
  int c = *inout_lookahead;

  if(!isalpha(c)) return 0;

  do{
    /* strictly less than: callers store a terminator at
     * buf[return value] in a buffer of SGML_NAMELEN+1 bytes. */
    if(name_chars < SGML_NAMELEN) buf[name_chars++] = toupper(c);
    /* else error: name too long */
    c = (getc)(stream);
  }while(isalnum(c) || is_nmchar(c));

  while(isspace(c)) c = (getc)(stream);

  *inout_lookahead = c;
  return name_chars;
}


/*
 * SGML_read_value -- read an attribute value.
 *
 * Accepts a literal in single or double quotes (entity references are
 * expanded through expand_entity) or, with SGML_SHORTTAG, an unquoted
 * value.  Stores the value at buf (not NUL terminated), skips trailing
 * whitespace and returns the number of bytes stored with the next
 * character in *inout_lookahead.  Returns EOF if the input ends before
 * a value starts; returns with *inout_lookahead == EOF if the value
 * reaches SGML_LITLEN bytes.
 */
int
SGML_read_value (stream, getc, buf,
		 entities, expand_entity, max_entity_length,
		 inout_lookahead)
     SGML_Object stream;
     SGML_Method getc;
     char* buf;
     SGML_Object entities;
     SGML_Method_charptrs expand_entity;
     int max_entity_length;
     int* inout_lookahead;
{
  int c; /* state machine input character */
  enum { /* state machine states */
    start, literal,
    and, and_hash, entity,
#ifdef SGML_SHORTTAG
    value,
#endif
    ps
    } state = start;

  /* auxiliary state: */
  int cref = 0; /* saw '#' after '&' */
  char quote = 0; /* which kind of quote */

  int ret = 0; /* number of characters read */
  char name[SGML_NAMELEN + 1]; /* entity name */
  int name_chars = 0;

#define LOOKAHEAD(n) (ret + (n) < SGML_LITLEN)
#define REDUCE(s) { state = (s); break; }
#define SHIFT(s) { state = (s); continue; }
#define DONE(c) { *inout_lookahead = (c); return ret; }
#define WRITE(c) { *buf++ = (c); ret++; }

  /* check arguments */
  if(max_entity_length < 2)
    max_entity_length = 2; /* enough for lookahead for &# processing */

  /* prime the pump */
  if((c = *inout_lookahead) == EOF)
    c = (getc)(stream);

  /* state machine...*/
  while(ret < SGML_LITLEN){
    switch(state){
    case start:
      if(c == EOF) return EOF;
      else if(c == '"') { quote = c; REDUCE(literal); }
      else if(c == '\'') { quote = c; REDUCE(literal); }
      else if(isspace(c)) break;
#ifdef SGML_SHORTTAG
      else if(isalnum(c)) { SHIFT(value); }
#endif
      else { DONE(c); } /* error: illegal char in markup */

#ifdef SGML_SHORTTAG
    case value:
      if(c == EOF) { DONE(c); }
#ifdef GROK_UNQUOTED_LITERALS
      else if(!(isspace(c) || c == '>')){
#else
      else if(isalnum(c) || is_nmchar(c)){
#endif
	WRITE(c);
	break;
      }else{
	SHIFT(ps);
      }
#endif

    case literal:
      if(c == EOF) { DONE(c); }
      else if(c == quote) { REDUCE(ps); }
      else if(c == '&'){
	if(LOOKAHEAD(max_entity_length)) { REDUCE(and); }
	/*
	 * *** NON CONFORMING IMPLEMENTATION ***
	 * attribute value _might_ be too long (which would be an error).
	 * we can't tell here, so we punt.
	 */
	else { DONE(c); }
      }else{
	WRITE(c);
	break;
      }

    case and:
      if(c == '#') { REDUCE(and_hash); }
      else if(isalpha(c)) {
	cref = 0;
	name_chars = 0;
	SHIFT(entity);
      }
      else{
	WRITE('&');
	SHIFT(literal);
      }

    case and_hash:
      if(isalnum(c)){
	cref = 1;
	name_chars = 0;
	SHIFT(entity);
      }
      else{
	WRITE('&'); WRITE('#');
	SHIFT(literal);
      }

    case entity:
      if(isdigit(c) || (cref == 0 && isalpha(c))){
	if(name_chars < SGML_NAMELEN) name[name_chars++] = c;
	/* else markup error: name too long */
	/* consume c and stay in this state; without the break,
	 * control falls into "ps" and returns mid-name. */
	break;
      }
      else{
	int entlen;

	name[name_chars] = '\0';
	entlen = (expand_entity)(entities, name, buf);
	ret += entlen; buf += entlen;
	if(c == ';') { REDUCE(literal); }
	else /* terminate entity reference w/space or something */
	  { SHIFT(literal); }
      }

    case ps: /* parameter separator between attributes */
      if(isspace(c)) break;
      else { DONE(c); }
    }
    c = (getc)(stream);
  }
  /* error: attribute value too long */
  DONE(EOF); /* set lookahead to EOF for next call */

#undef LOOKAHEAD
#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}
@//E*O*F SGMLstream.c//
chmod u=rw,g=rw,o=r SGMLstream.c

echo x - HTMLentities.c
sed 's/^@//' > "HTMLentities.c" <<'@//E*O*F HTMLentities.c//'
/* HTMLentities.c
 * $Id$
 */

#include "HTMLentities.h"

#include <string.h>

/* entity table; the entry with a null name marks the end */
struct _entity_declaration HTML_entities[] ={
  {"lt", '<'},
  {"gt", '>'},
  {"amp", '&'},
  {"quot", '"'},
  {"apos", '\''},
  {0},
};

/*
 * HTML_expand_entity -- look name up in the table at entities.
 * On a match, stores the one-character expansion at val and returns 1;
 * otherwise stores nothing and returns 0.  The comparison is case
 * sensitive.
 */
int
HTML_expand_entity(entities, name, val)
     VOIDPTR entities;
     CONST char* name;
     char* val;
{
  struct _entity_declaration *ed;

  for(ed = (struct _entity_declaration*)entities; ed->name; ed++){
    if(strcmp(name, ed->name) == 0){
      *val = ed->value;
      return sizeof(char);
    }
  }
  return 0; /* error: undefined entity */
}
@//E*O*F HTMLentities.c//
chmod u=rw,g=rw,o=r HTMLentities.c

echo x - test.html
sed 's/^@//' > "test.html" <<'@//E*O*F test.html//'
HyperText Markup: Recommended Usage

Recommended HTML Usage

These constructs should work even on pretty broken implementations.

Text Elements

Most text elements consist of a start tag, some content, and an end tag. A start tag is an identifier surrounded by angle brackets. An end tag is an open angle bracket, a slash, an identifier, and a close bracket.

An identifier should be a letter followed by up to 7 letters or numbers.

No spaces are allowed between the tag open bracket and the identifier. Space is allowed between the identifier and the close bracket.

Some elements are "empty" and consist of only a start tag.

Paragraphs are separated by the "P" element.

Six levels of headings are supported:

Level three heading

Level four heading

five
six
Unordered lists:

  • This is the first item of an unordered list.
  • This is the second item. It's kinda long, and should wrap around on most screens.

  • This is the third item. It's only one paragraph, but it's got a paragraph tag at the end.

  • This is the fourth and final item.
Ordered lists:

  1. This is the first item of an ordered list.
  2. This is the second item. It's kinda long, and should wrap around on most screens.
  3. This is the third item -- you know, the one with the P element.

  4. This is the fourth and final item.
term
definition
another term
and its definition
The address element indicates the author or source of the document.
DWC

connolly@convex.com

Normal Text: PCDATA

Normal text is represented in HTML as parsed character data, #PCDATA. The characters '<', '>', and '&' should be represented as "&lt;", "&gt;", and "&amp;" respectively, lest they be interpreted as markup. Lines should not exceed 72 characters. Line breaks have no significance except to separate words.

Literal Text: RCDATA

Sections of literal text are represented in HTML as replaceable character data. Line breaks are significant, and characters are rendered in a fixed-width font to preserve horizontal formatting.

This is literal text. THIS word should line up under THIS word. There should be exactly three blank lines between here and here. The '&' character should be represented as "&amp;". The character sequence "</" must be represented as "&lt;/". The character sequence "]]>" must be represented as "]]&gt;". SGML tags look like <start> and &lt;/end>. The marked section close delimiter looks like ]]&gt;. But ]] is just two close square brackets, and > is just a greater-than sign.

Document Description Elements

The TITLE element names the document. The content of the TITLE element is just character data, CDATA. It should be less than 72 characters, and it should contain no linebreaks, '<', '>', or '&' characters.

The ISINDEX tag appears at most one time, and it precedes all tags but TITLE and NEXTID.

Elements with Attributes

Some elements have associated named attributes. The values of the attributes of an element are specified in its start tag.

Attribute values are represented as RCDATA surrounded by double quotes. The character '"' must be represented as "&quot;" in an attribute value literal. The NEXTID tag appears at most one time, after the title and before the text elements.

@//E*O*F test.html//
chmod u=rw,g=rw,o=r test.html

echo x - SGMLstream.h
sed 's/^@//' > "SGMLstream.h" <<'@//E*O*F SGMLstream.h//'
/* SGML_stream.h
 * $Id$
 */

#ifndef SGML_stream_h
#define SGML_stream_h

#include "c_dialect.h"

/*
 * supported variations on the SGML declaration
 */
#ifdef SGML_DECLARATION
#include SGML_DECLARATION
#endif

#ifndef SGML_NAMELEN
#define SGML_NAMELEN 8
#endif

#ifndef SGML_LITLEN
#define SGML_LITLEN 240
#endif

#ifndef SGML_SHORTTAG
#define SGML_SHORTTAG 1
#endif

#ifndef SGML_UCNMCHAR
#define SGML_UCNMCHAR ".-"
#endif

#ifndef SGML_LCNMCHAR
#define SGML_LCNMCHAR ""
#endif

/*
 * NOTE(review): in the copy of this archive that was available, the
 * text from the SGML_CDATA comment down to the "nbytes" line of the
 * SGML_read PRE comment had been lost.  The declarations in between
 * are reconstructed from the way SGMLstream.c, SGMLmain.c and
 * HTMLentities.c use these names.  The numeric values chosen for
 * SGML_start_tag and SGML_end_tag in particular are a guess: they only
 * need to differ from EOF and from any byte count.  Confirm against a
 * pristine copy.
 */

/*
 * supported content types
 */
enum {
  SGML_CDATA,	/* character data. recognize </ only */
  SGML_RCDATA,	/* replaceable character data.
		 * recognize </ and entity references */
  SGML_PCDATA	/* parsed character data. recognize all markup */
};

/*
 * special return values of SGML_read
 */
#define SGML_start_tag (-2)
#define SGML_end_tag (-3)

typedef VOIDPTR SGML_Object;
typedef int (*SGML_Method) PARAMS((SGML_Object));
typedef int (*SGML_Method_charptrs) PARAMS((SGML_Object,
					     CONST char*, char*));

int
SGML_read PARAMS((SGML_Object stream, SGML_Method getc,
		  char* buf, int nbytes,
		  SGML_Object entities, SGML_Method_charptrs expand_entity,
		  int max_entity_length,
		  int declared_content,
		  int* inout_lookahead));
/*
 * PRE:
 *   stream -- opaque stream object
 *   getc -- getc method for stream. returns -1 on EOF
 *   buf -- where to store data
 *   nbytes -- size of buf. should be > 3!
 *   entities -- opaque entities object
 *   expand_entity -- method for entities
 *        (expand_entity)(entities, name, dest)
 *        stores the expansion of name at dest, and returns the length
 *   max_entity_length -- upper bound on return value of expand_entity
 *   declared_content -- SGML_CDATA, SGML_RCDATA, or SGML_PCDATA
 *   inout_lookahead -- EOF or first character of input
 *
 * POST:
 *   returns value:
 *     SGML_start_tag ==> sgml start tag found. *inout_lookahead
 *                        is first character of name. rest of name
 *                        follows on stream.
 *     SGML_end_tag ==> sgml end tag found, like start tag
 *     EOF ==> EOF found before any tags or data
 *             (note that SGML_read may skip over comments,
 *              processing instructions, and markup declarations
 *              to get to EOF)
 *             *inout_lookahead is not defined.
 *     0 ==> possible markup was found ('<' or '&')
 *           and nbytes was not sufficient to determine
 *           whether the character was markup or data, or
 *           '&' found and nbytes is not sufficient to expand
 *           an entity.
 *           NOTE: if nbytes >= max_entity_length
 *                 and nbytes > 3, SGML_read is guaranteed
 *                 not to return 0.
 *           *inout_lookahead is set to the last value
 *           read from stream.
 *     0 < ret < nbytes ==> ret bytes of data found, followed by
 *                  '<', which may begin a tag, or
 *                  '&' with insufficient room to determine
 *                        whether it's data, or insufficient
 *                        room to expand the entity
 *     nbytes ==> nbytes of data found.
 *                *inout_lookahead is the next character read from
 *                stream (ready for next call)
 */

int
SGML_read_name PARAMS((SGML_Object stream, SGML_Method getc,
		       char* buf,
		       int* inout_lookahead));
/*
 * PRE:
 *   stream -- opaque stream object
 *   getc -- getc method for stream. returns -1 on EOF
 *   buf -- where to store name (must be at least SGML_NAMELEN chars)
 *   inout_lookahead -- EOF or first character of input
 *
 * POST:
 *   returns value:
 *     0 ==> first character is not a name.
 *     0 < ret ==> name is ret bytes long
 *                 folded to upper case and stored at buf
 *                 (not NUL terminated).
 *                 trailing whitespace is skipped
 *                 *inout_lookahead = last value read from stream
 */

int
SGML_read_value PARAMS((SGML_Object stream, SGML_Method getc,
			char* buf,
			SGML_Object entities,
			SGML_Method_charptrs expand_entity,
			int max_entity_length,
			int* inout_lookahead));
/*
 * PRE:
 *   stream -- opaque stream object
 *   getc -- getc method for stream. returns -1 on EOF
 *   buf -- where to store value (must be at least SGML_LITLEN chars)
 *   entities -- opaque entities object
 *   expand_entity -- method for entities
 *        (expand_entity)(entities, name, dest)
 *        stores the expansion of name at dest, and returns the length
 *   max_entity_length -- upper bound on return value of expand_entity
 *   inout_lookahead -- EOF or first character of input
 *
 * POST:
 *   returns value:
 *     EOF ==> EOF found before the start of a value.
 *             *inout_lookahead is not defined.
 *     0 ==> first character is not a letter, a digit, or a quote.
 *     0 < ret ==> value is ret bytes long
 *                 stored at buf with entities expanded
 *                 (not NUL terminated).
 *                 trailing whitespace is skipped
 *                 *inout_lookahead = last value read from stream
 */

#endif /* SGML_stream_h */
@//E*O*F SGMLstream.h//
chmod u=rw,g=rw,o=r SGMLstream.h

echo x - c_dialect.h
sed 's/^@//' > "c_dialect.h" <<'@//E*O*F c_dialect.h//'
/* c_dialect.h
 * $Id$
 */

#ifndef c_dialect_h
#define c_dialect_h

/* __STDC__ is defined by ANSI C */
/* __stdc__ should be set to 1 for compilers that are not
   strictly conforming ANSI C compilers, and thus don't define __STDC__,
   but do support the following features. (e.g. CONVEX C)
 */
/* K&R style C is the default */

/* PARAMS(x): keep the parameter list x in a prototype, or drop it */
#ifndef PARAMS
#if defined(__STDC__) || defined(__stdc__)
#define PARAMS(x) x
#else
#define PARAMS(x) ()
#endif
#endif

/* CONST: the const qualifier, or nothing */
#ifndef CONST
#if defined(__STDC__) || defined(__stdc__)
#define CONST const
#else
#define CONST
#endif
#endif

/* VOIDPTR: generic pointer type */
#ifndef VOIDPTR
#if defined(__STDC__) || defined(__stdc__)
#define VOIDPTR void*
#else
#define VOIDPTR char*
#endif
#endif

#endif /* c_dialect_h */
@//E*O*F c_dialect.h//
chmod u=rw,g=rw,o=r c_dialect.h

echo x - HTML_SGML_decl.h
sed 's/^@//' > "HTML_SGML_decl.h" <<'@//E*O*F HTML_SGML_decl.h//'
/* HTML_SGML_decl.h
 * $Id$
 *
 * Overrides for the defaults in SGMLstream.h; pulled in there when
 * SGML_DECLARATION is defined to this file's name.
 */

#define SGML_NAMELEN 34
#define SGML_LITLEN 1024
#define SGML_SHORTTAG 1

/* accept any run of characters up to whitespace or '>' as an
 * unquoted attribute value */
#define GROK_UNQUOTED_LITERALS 1
@//E*O*F HTML_SGML_decl.h//
chmod u=rw,g=rw,o=r HTML_SGML_decl.h

echo x - HTMLentities.h
sed 's/^@//' > "HTMLentities.h" <<'@//E*O*F HTMLentities.h//'
/* HTMLentities.h
 * $Id$
 */

#ifndef HTMLentities_h
#define HTMLentities_h

#include "SGMLstream.h"

/* one entity: its name and its single-character expansion.
 * (no "typedef" here: the declaration names no typedef.) */
struct _entity_declaration{
  char* name;
  char value;
};

/* entity table, terminated by an entry whose name is null */
extern struct _entity_declaration HTML_entities[];

/* stores the expansion of name at val and returns its length,
 * or returns 0 if name is not in the table at entities */
int HTML_expand_entity PARAMS((VOIDPTR entities,
			       CONST char* name, char* val));

#endif /* HTMLentities_h */
@//E*O*F HTMLentities.h//
chmod u=rw,g=rw,o=r HTMLentities.h

exit 0