#!/usr/bin/python
"""
slurpIcalSpec.py -- extract formal view of iCalendar/vCard specs

produces XHTML with various typed links

@@TODO: make each marked up item visible via css
(unless they're all linked).

see also: rfc2html in dev.w3.org @@

NOTE(review): the copy of this file that was reviewed had been flattened
onto a few physical lines and every "<...>" span had been stripped from
it, which emptied the markup out of the string literals (leaving format
strings with fewer %s slots than arguments).  The tags written below
were rebuilt from the surviving evidence -- argument counts, the doctest
output, the "dd.<class>" CSS rule, and the id/rel/href values the code
computes -- and must be confirmed against a pristine revision 1.27.
Runs of spaces inside string literals were collapsed as well, so
indentation-sensitive tests now compare after lstrip().
"""

__version__ = '$Id: slurpIcalSpec.py,v 1.27 2005/11/09 23:10:49 connolly Exp $'
#see also: changelog at end

import re
import string
import sys


class Usage(Exception):
    """USAGE: python slurpIcalSpec.py NNNN <rfcNNNN.txt >rfcNNNN.html
    where NNNN is one of the supported RFCS
    (so far: 2425, 2426, and 2445)
    """


# rfcnum: (footerStarters, typos, example_tags)
#  footerStarters: prefixes of the running page header/footer lines
#  typos: (wrong, right) substitutions applied to every input line
#  example_tags: structured-section tags whose <dd> gets example styling
Specs = {
    2445: (("RFC", "Dawson"),
           (('"EVENT"', '"VEVENT"'),  # 4.8.7.3 Last Modified
            (' Purpose This value type', ' Purpose: This value type'),
            ('11. Full Copyright Statement', '11 Full Copyright Statement'),
            ('3.11 Contact for Further Information:',
             '3.11 Contact for Further Information'),
            ),
           ('Example',),
           ),
    2426: (("RFC", "Dawson"),
           # a ref is split across lines; put it on one line
           (('Hence, this [MIME-', 'Hence, this [MIME-DIR]'),
            ('DIR] profile is', ' profile is'),
            ),
           ('Type example',),
           ),
    2425: (("RFC", "Howes"), (), (), )
}


def _esc(txt):
    """return txt with & and < escaped for use as XHTML character data"""
    return txt.replace("&", "&amp;").replace("<", "&lt;")


def _escAttr(txt):
    """return txt escaped for use inside a single-quoted XHTML attribute"""
    return _esc(txt).replace("'", "&#39;")


def _splitRef(ln):
    """split '... [LABEL] text' into (LABEL, text)

    Raises ValueError when ln carries no bracketed label, which is
    what the unpacking in the original code did, but with a message
    that says which line is at fault.

    >>> _splitRef('[RFC 2045] Freed, N.')
    ('RFC 2045', ' Freed, N.')
    """
    start = ln.find('[')
    end = ln.find(']', start + 1)
    if start < 0 or end < 0:
        raise ValueError("reference entry without a [label]: " + repr(ln))
    return ln[start + 1:end], ln[end + 1:]


def main(argv):
    """convert the RFC text on stdin to XHTML on stdout

    argv is [program, rfcnum]; raises Usage when the argument is
    missing, is not a number, is not a key of Specs, or when stdin
    is empty.
    """
    if len(argv) != 2:
        raise Usage()
    try:
        rfcnum = int(argv[1])
        pgbrks, typos, exampleTags = Specs[rfcnum]
    except (ValueError, KeyError):
        raise Usage()

    lines = depaginate(sys.stdin, pgbrks, typos)
    sections = bySection(lines)
    head = next(sections, None)
    if head is None:
        # nothing on stdin: most likely the input redirection was forgotten
        raise Usage()
    title, rfcnum, category, date, authlines = titleEtc(head)

    w = sys.stdout.write
    w(htmlTop(title, rfcnum, category, date, authlines, exampleTags))
    w("\n")

    # two passes: collect the reference labels first so that body
    # text ahead of the References section can link to them
    sections = list(sections)
    refs = findRefs(sections)

    for sec in sections:
        if sec[0].find("Table of Contents") >= 0:
            tocSect(w, sec)
        elif sec[0][:1].isdigit():
            numSect(w, sec, refs)
        else:
            flowSect(w, sec, refs)
    w(htmlBot())
    w("\n")


def depaginate(fp, footerStarters, typos):
    """undo RFC pagination: generate sequence of lines

    skip lines at top of page

    'Each page must be limited to 58 lines followed by a form feed on
    a line by itself.'
      --Instructions to RFC Authors  Postel & Reynolds Oct 1997
      http://www.ietf.org/rfc/rfc2223

    fp is any iterable of lines; footerStarters are the prefixes that
    mark page header/footer lines; typos is a sequence of (wrong,
    right) substitutions applied to each line first.  Each page break
    is replaced by a single blank line; blank lines at the top of a
    page and at the very end of the input are not yielded.
    """
    footers = tuple(footerStarters)
    saved = []     # blank lines held back until more text follows
    top = True     # true while still at the top of a page
    for line in fp:
        line = line.rstrip("\r\n")
        for err, fix in typos:
            line = line.replace(err, fix)

        if line == "\f" or line.startswith(footers):
            saved = ['']  # leave a blank line where the pagebreak was
            top = True
        elif line:
            for blank in saved:
                yield blank
            saved = []
            yield line
            top = False
        elif not top:
            saved.append(line)


def bySection(lines):
    """group lines into sections: generate lists of lines

    A new section starts at each non-blank line that is not indented,
    is not an RFC header line, and contains none of ':', '<' or '--';
    every other line is added to the current section.
    """
    sec = []
    for l in lines:
        continues = (not l
                     or l.startswith(" ")
                     or l.startswith("Request for Comments")
                     or l.startswith("Category:")
                     or ":" in l
                     or "<" in l
                     or "--" in l)
        if not continues and sec:
            yield sec
            sec = []
        sec.append(l)
    if sec:
        yield sec


def titleEtc(lines):
    """pick apart the RFC front-matter section

    Returns (title, rfcnum, category, date, authlines).  The first 40
    columns of the lines ahead of the first blank line hold the
    series information, the rest hold the author lines followed by
    the date; the title is everything after that blank line.

    Raises ValueError when there is no blank line or the header is
    too short to hold the RFC number and category.
    """
    sep = None
    for idx, l in enumerate(lines):
        if l.strip() == '':
            sep = idx
            break
    if sep is None:
        raise ValueError("no blank line separating header from title: "
                         + repr(lines))
    if sep < 3:
        raise ValueError("RFC header too short: " + repr(lines))

    title = ' '.join([l.strip() for l in lines[sep + 1:]]).strip()
    rfcnum = lines[1][:40].strip().split()[-1]
    category = lines[2][:40].strip().split(':')[1].strip()
    date = lines[sep - 1].strip()
    authlines = [l[40:].strip() for l in lines[:sep - 1]]
    return title, rfcnum, category, date, authlines


def findRefs(sections):
    """return the labels defined in the References section

    The first section whose heading ends with "References" is used;
    within it the first line of each blank-line-separated entry must
    look like "[LABEL] ...".  Raises ValueError when there is no such
    section or an entry has no label.
    """
    for sec in sections:
        if not sec[0].strip().endswith("References"):
            continue
        refs = []
        inref = False
        for ln in sec[1:]:
            ln = ln.strip()
            if ln == '':
                inref = False
            elif not inref:
                ref, ln = _splitRef(ln)
                refs.append(ref)
                inref = True
        return refs
    raise ValueError("no References section found")


def htmlTop(title, rfcnum, category, date, authlines, exampleTags):
    """return the document head, RFC header table and title heading

    One "dd.<class>" CSS rule is written per tag in exampleTags.
    """
    # the original rule was never closed with "}", which swallowed
    # whatever followed it in the style sheet
    css = "".join(
        ["dd.%s { border-style: solid; border-color: #d0dbe7;"
         " border-width: 0 0 0 .25em; padding-left: 0.5em; }\n" % asClass(t)
         for t in exampleTags])
    # NOTE(review): the changelog mentions a GRDDL profile (rev 1.5,
    # 1.20); its attributes did not survive and are not guessed here.
    html = """<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>%s</title>
<style type="text/css">
%s</style>
</head>
<body>
<table>
<tr><td>Network Working Group<br />
Request for Comments: %s<br />
Category: %s</td>
<td>%s<br />
%s</td></tr>
</table>

<h1>%s</h1>

<address>$Revision: 1.27 $ of $Date: 2005/11/09 23:10:49 $
derived from <a href="%s.txt">rfc%s.txt</a>
and enhanced for gleaning formal description
using slurpIcalSpec.py by Dan Connolly</address>
""" % (_esc(title), css, _esc(rfcnum), _esc(category),
       '<br />\n'.join([_esc(a) for a in authlines]), _esc(date),
       _esc(title), _esc(rfcAddr(rfcnum)), _esc(rfcnum))
    return html


def htmlBot():
    """return the markup that closes what htmlTop() opened"""
    return "</body>\n</html>"


def flowSect(w, lines, refs):
    """write an unnumbered section as flowed paragraphs"""
    w("<div>\n<h2>%s</h2>\n" % (_esc(lines[0]),))
    flowSectRest(w, lines, refs)


def flowSectRest(w, lines, refs):
    """write lines[1:] as paragraphs split at blank lines; close the div"""
    inPara = False
    for l in lines[1:]:
        if l.strip():
            if not inPara:
                w("<p>\n")
            bodyText(w, l, refs)
            w("\n")
            inPara = True
        elif inPara:
            w("</p>\n")
            inPara = False
    if inPara:
        w("</p>\n")
    w("</div>\n")


def tocSect(w, lines):
    """write the table-of-contents heading; the entries are dropped"""
    w("<div>\n<h2>%s</h2>\n" % (_esc(lines[0]),))
    w("</div>\n")


def refSect(w, lines):
    """write the body of the References section as a definition list

    Entries are separated by blank lines and start with "[LABEL]".
    Also closes the div opened by numSect().
    """
    inref = False
    ref = None
    dd = None
    w("<dl>\n")
    for ln in lines[1:]:
        ln = ln.strip()
        if ln == '':
            if dd:
                refEntry(w, ref, dd)
                # forget the entry once written; otherwise a second
                # blank line or the end of the section writes it again
                dd = None
            inref = False
        elif not inref:
            ref, dd = _splitRef(ln)
            dd = dd + "\n"
            inref = True
        else:
            dd = dd + ln + "\n"
    if dd:
        refEntry(w, ref, dd)
    w("</dl>\n")
    w("</div>\n")


def refEntry(w, ref, dd):
    r"""write a reference entry

    When dd holds exactly one double-quoted title the title is marked
    up, and linked when ref or dd names an RFC or dd holds a URL.

    >>> out = []
    >>> refEntry(out.append, 'IMIP', 'Dawson, F., Mansour, S. and S. Silverberg, "iCalendar Message-based Interoperability Protocol (IMIP)", RFC 2447, November 1998.')
    >>> ''.join(out)
    "<dt id='ref_IMIP'>[IMIP]</dt>\n<dd>Dawson, F., Mansour, S. and S. Silverberg, <cite><a href='http://www.ietf.org/rfc/rfc2447'>iCalendar Message-based Interoperability Protocol (IMIP)</a></cite>, RFC 2447, November 1998.</dd>\n"
    """
    w("<dt id='%s'>[%s]</dt>\n" % (_escAttr(asID(ref)), _esc(ref)))
    w("<dd>")

    # try to mark up the title
    parts = dd.split('"')
    if len(parts) == 3:
        before, title, after = parts
        doChars(w, before)
        w("<cite>")

        # try to make it a link
        href = None
        if ref.startswith("RFC "):
            href = rfcAddr(ref.split(' ')[1])
        else:
            m = re.search(r'RFC (\d\d\d\d?)', dd)
            if m:
                href = rfcAddr(m.group(1))
            else:
                m = re.search(r'((http|ftp)://[^ ,]+)', dd)
                if m:
                    href = m.group(1)

        if href:
            w("<a href='%s'>" % _escAttr(href))
            doChars(w, title)
            w("</a>")
        else:
            doChars(w, title)
        w("</cite>")
        doChars(w, after)
    else:
        doChars(w, dd)
    w("</dd>\n")


def rfcAddr(num):
    """return the address of RFC number num"""
    return 'http://www.ietf.org/rfc/rfc%s' % num


def asID(ref):
    """turn a reference label into an ID

    >>> asID('VCARD')
    'ref_VCARD'

    >>> asID('RFC 1872')
    'ref_RFC_1872'
    """
    return 'ref_' + ref.replace(' ', '_')


# what the first body line of a structured section starts with (after
# indentation), and the secType / rdfClass it is handed on with
_STRUCTURED = (
    ("Property Name:", "Property", 'Property'),
    # 4.8.3 indented oddly; comparing after lstrip() covers both forms
    ("Value Name:", "Value", None),
    ("Component Name:", "Component", 'Class'),
    ("Parameter Name:", "Parameter", 'Property'),
    # NOTE(review): the address was obfuscated in the reviewed copy;
    # this prefix is assumed to match the vCard registration templates
    ("To: ietf-mime-direct", "Type", None),
)


def numSect(w, lines, refs):
    """write a numbered section, choosing a layout by its content

    References become a definition list, boilerplate sections are
    flowed, sections that open with a "... Name:" line (or a type
    registration "To:" line) are written as structured definition
    lists, and everything else is written preformatted.
    """
    parts = lines[0].split(None, 1)
    num = parts[0]
    head = ''
    if len(parts) > 1:
        head = parts[1].rstrip()
    # no trailing "." in the ID (see rev 1.22)
    # NOTE(review): the "sec_" prefix is a reconstruction
    w("<div>\n<h2 id='%s'>%s %s</h2>\n"
      % (_escAttr('sec_' + num.rstrip('.')), _esc(num), _esc(head)))

    if head == "References":
        refSect(w, lines)
        return
    if head in ('Full Copyright Statement', 'Acknowledgements',
                'Acknowledgments', 'Abstract'):
        flowSectRest(w, lines, refs)
        return

    # short sections have no third line to inspect
    third = ''
    if len(lines) > 2:
        third = lines[2].lstrip()
    for prefix, secType, rdfClass in _STRUCTURED:
        if third.startswith(prefix):
            doStructuredSection(w, lines, refs, secType, rdfClass)
            return

    w("<pre>")
    for l in lines[1:]:
        bodyText(w, l, refs)
        w("\n")
    w("</pre>\n</div>\n")


def bodyText(w, txt, refs):
    """
    write body text, linking refs

    Brackets that do not enclose one of refs are written unchanged.

    >>> out = []
    >>> bodyText(out.append, 'abc [def] ghi', ['def'])
    >>> ''.join(out)
    "abc <a href='#ref_def'>[def]</a> ghi"

    >>> out = []
    >>> bodyText(out.append, 'x [y] z', [])
    >>> ''.join(out)
    'x [y] z'
    """
    parts = txt.split('[')
    doChars(w, parts[0])
    for part in parts[1:]:
        for ref in refs:
            if part.startswith(ref + ']'):
                part = part[len(ref) + 1:]
                w("<a href='#%s'>[%s]</a>"
                  % (_escAttr(asID(ref)), _esc(ref)))
                break
        else:
            # not a reference: put back the bracket split() consumed
            w('[')
        doChars(w, part)


def doChars(w, txt):
    """write txt as XHTML character data"""
    w(_esc(txt))


# ala iana-token = 1*(ALPHA / DIGIT / "-")
NAMECHARS = string.ascii_letters + string.digits + '-'

# labels that start a new item inside a structured section
Tags = ('Purpose', 'Formal Definition',
        'Value Type',
        'Property Parameters', 'Property Parameter',
        'Conformance', 'Description',
        'Format Definition',
        'Example',
        'To', 'Subject',
        'Type name', 'Type purpose', 'Type encoding', 'Type value',
        'Type special notes', 'Type example'
        )


def doStructuredSection(w, lines, refs, secType, rdfClass=None):
    """write a "Label: text" style section as a definition list

    Starting at the third line, a line whose text up to the first
    colon is "<secType> Name" or one of Tags opens a new item; all
    other lines are added to the current item.  Also closes the div
    opened by numSect().
    """
    w("<dl>\n")
    secLabel = '%s Name' % secType
    dt = ''
    dd = []
    for l in lines[2:]:
        hd, colon, rest = l.lstrip().partition(":")
        if colon and (hd == secLabel or hd in Tags):
            if dd:
                subSect(w, secType, rdfClass, dt, dd, refs)
            dt = hd
            dd = [rest]
        else:
            dd.append(l)
    if dd:
        subSect(w, secType, rdfClass, dt, dd, refs)
    w("</dl>\n")
    w("</div>\n")


# value types a property may be switched to, by token
_ALLOWED_TYPES = {'DATE': 'Value_DATE',
                  'DATE-TIME': 'Value_DATE-TIME',
                  'PERIOD': 'Value_PERIOD',
                  'BINARY': 'Value_BINARY'}

# components a property applies to, by token
_COMPONENTS = {'VEVENT': 'Vevent',
               'VTODO': 'Vtodo',
               'VJOURNAL': 'Vjournal',
               'VFREEBUSY': 'Vfreebusy',
               'VTIMEZONE': 'Vtimezone',
               'VALARM': 'Valarm'
               }

# timezone sub-components defined in a Component description
_TZ_PARTS = {'STANDARD': 'standard',
             'DAYLIGHT': 'daylight',
             }

# recurrence rule parts defined in a Value description
_RRULE_PARTS = {'FREQ': 'freq',
                'UNTIL': 'until',
                'COUNT': 'count',
                'INTERVAL': 'interval',
                'BYSECOND': 'bysecond',
                'BYMINUTE': 'byminute',
                'BYHOUR': 'byhour',
                'BYDAY': 'byday',
                'BYMONTHDAY': 'bymonthday',
                'BYYEARDAY': 'byyearday',
                'BYWEEKNO': 'byweekno',
                'BYMONTH': 'bymonth',
                'BYSETPOS': 'bysetpos',
                'WKST': 'wkst',
                }


def subSect(w, secType, rdfClass, dt, dd, refs):
    """write one item (dt label, dd lines) of a structured section

    A "... Name" item gets an id derived from the name: camelCase
    when rdfClass is given, "<secType>_<name>" otherwise.  A "Value
    Type" item links its type name; Conformance and Description items
    link the tokens they mention.
    """
    if dt.endswith(" Name") or dt == 'Type name':
        name = ''.join(dd).strip()
        # VEVENT is quoted extraneously
        if name[:1] == '"':
            name = name[1:-1]
            dd[0] = name
        if name.startswith("Any property name with"):
            name = "X-"
        elif rdfClass:
            name = camelCase(name, rdfClass == 'Class')
        else:
            name = secType + "_" + name
        w("<dt id='%s'>%s</dt>\n" % (_escAttr(name), _esc(dt)))
    else:
        w("<dt>%s</dt>\n" % (_esc(dt),))

    w("<dd class='%s'>" % (_escAttr(asClass(dt)),))
    if dt == 'Value Type':
        # the first sentence names the type; the rest is commentary
        rest = dd[0]
        rel = 'value-type'
        if '.' in rest:
            name, rest = rest.split('.', 1)
            name = name.strip()
            rest = '.' + rest
        else:
            name = rest.strip()
            rest = ''
        if name.startswith("The default"):
            rel = 'default-value-type'
            txt = name
            name = txt.split()[-1]
            doChars(w, txt[:-len(name)])
        if 'separated' in rest:
            rel = 'list-of'
        w("<a rel='%s' href='#Value_%s'>%s</a><pre>   %s\n"
          % (rel, _escAttr(name), _esc(name), _esc(rest)))

        for l in dd[1:]:
            l = tokenRefs(w, l, 'allowed-type', _ALLOWED_TYPES)
            bodyText(w, l, refs)
            w("\n")

    else:
        tokens = None
        rel = None

        if secType == "Property" and dt in ('Conformance', 'Description'):
            rel = 'applies-to'
            tokens = _COMPONENTS
        elif dt == 'Description' and secType == "Component":
            rel = 'def'
            tokens = _TZ_PARTS
        elif dt == 'Description' and secType == "Value":
            rel = 'def'
            tokens = _RRULE_PARTS

        w("<pre>   ")

        # shared across the lines so that each 'def' id is written
        # once per item rather than once per line
        seen = {}
        for l in dd:
            if tokens:
                l = tokenRefs(w, l, rel, tokens, seen)
            bodyText(w, l, refs)
            w("\n")

    w("</pre>\n</dd>\n")


def asClass(t):
    """return heading tag t as a class name
    """
    return t.replace(" ", '')


def tokenRefs(w, l, rel, tokens, seen=None):
    """write l up to and including its last token, linking each token

    tokens maps the text to look for to the ID to link to.  Returns
    the unwritten remainder of l, for the caller to write.  When rel
    is 'def' the first link to each ID also carries that id; pass the
    same seen dict for several lines to keep those ids unique.

    >>> out = []
    >>> tokenRefs(out.append, 'a DATE-TIME value', 'allowed-type',
    ...           {'DATE': 'Value_DATE', 'DATE-TIME': 'Value_DATE-TIME'})
    ' value'
    >>> ''.join(out)
    "a <a rel='allowed-type' href='#Value_DATE-TIME'>DATE-TIME</a>"
    """
    if not tokens:
        # an empty alternation matches the empty string forever
        return l
    if seen is None:
        seen = {}
    # alternation takes the first branch that matches, so longer
    # tokens go first: DATE must not pre-empt DATE-TIME
    pat = re.compile('|'.join(
        [re.escape(t) for t in sorted(tokens, key=len, reverse=True)]))
    while l:
        m = pat.search(l)
        if not m:
            break
        doChars(w, l[:m.start()])
        t = m.group(0)
        target = tokens[t]
        if rel == 'def' and target not in seen:
            w("<a id='%s' rel='%s' href='#%s'>%s</a>"
              % (_escAttr(target), rel, _escAttr(target), _esc(t)))
            seen[target] = 1
        else:
            w("<a rel='%s' href='#%s'>%s</a>"
              % (rel, _escAttr(target), _esc(t)))
        l = l[m.end():]
    return l


def camelCase(n, initialCap=0):
    """turn a hyphenated name into a camelCase one

    >>> camelCase('DATE-TIME')
    'dateTime'

    >>> camelCase('VEVENT', 1)
    'Vevent'
    """
    words = [word.lower() for word in n.split('-')]

    def ucfirst(word):
        # slicing, so that the empty word left by "X-" is harmless
        return word[:1].upper() + word[1:]

    if initialCap:
        return ''.join([ucfirst(word) for word in words])
    return words[0] + ''.join([ucfirst(word) for word in words[1:]])


def _test():
    import doctest
    doctest.testmod()


if __name__ == '__main__':
    if '--test' in sys.argv:
        _test()
    else:
        try:
            main(sys.argv)
        except Usage as e:
            sys.stderr.write(e.__doc__ + "\n")


# $Log: slurpIcalSpec.py,v $
# Revision 1.27  2005/11/09 23:10:49  connolly
# - changed the way duration values are modelled
#   The iCalendar DURATION value type is actually more than just a
#   XMLSchema.duration; it also has a RELATED parameter.
#   So for
#     TRIGGER;VALUE=DURATION;RELATED=START:-PT15M
#   we'll write
#     { ?E cal:trigger [ rdf:value "-PT15M"^^xsdt:duration;
#                        cal:related "START"] }
#
# - fixed test data to have rdf:datatype on integer
#   values, to match the schema (which matches the RFC)
#
# - fixed schema to show DATE-TIME properties (dtstart, ...)
#   as DatatypeProperties
#   (there are little/no tests for PERIOD; beware)
#
# - scraped more details about property parameters (e.g. partstat, cn,
#   cutype, ...) and rrule parts (freq, interval, ...) from the RFC so
#   that they show up as links in the hypertext version and as RDF
#   properties in the schema. likewise timezone components (standard,
#   daylight)
#   - side effect: added some whitespace in rfc2445.html
#
# - demoted x- properties
#   - removed x- properties from .rdf versions of test data
#     this allows the round-trip tests to pass
#   - fromIcal.py doesn't output them unless you give the --x option
#
# - added Makefile support for consistency checking with pellet
#
# - demoted blank line diagnostic in fromIcal.py to a comment
#
# - silenced some left-over debug diagnostics in slurpIcalSpec.py
#
# - fixed test/test-created.rdf; added it to fromIcalTest.py list
#
# Revision 1.26  2005/07/22 21:14:32  connolly
# remove : from iCalendar heading
#
# Revision 1.25  2005/07/22 21:00:00  connolly
# - added support for RFC2425, which has
#   - numbered Abstract and TOC
#   - examples that start in column 1
#
# Revision 1.24  2005/07/22 20:42:12  connolly
# - handle VCARD structured section tags
# - working on example extraction; started with CSS style
# - no bullets on TOC items; just the numbers
# - handle a ref split across lines in VCARD as a couple typos
#
# Revision 1.23  2005/07/22 19:51:28  connolly
# - parameterize RFC-specific bits so it works for RFC2426 also
# - factor out typo handling
# - take RFC number on command line; write diagnostic for incorrect usage
#
# Revision 1.22  2005/07/22 19:28:18  connolly
# - mark up titles in bibliography; make links to RFCs
# - render copyright, acks sections flowed rather than pre
# - fix extra . at end of section ID
#
# Revision 1.21  2005/07/22 18:49:58  connolly
# - handle references in 2 passes
#   - 1st pass to find ref labels in refs section
#   - 2nd pass to format references from the body and the bibliography
# - added some unit tests and a --test option
# - handle unnumbered overview section in TOC
#
# Revision 1.20  2004/02/29 14:52:00  connolly
# new grddl names
#
# Revision 1.19  2004/02/12 06:31:23  connolly
# fix EVENT to VEVENT typo
#
# Revision 1.18  2004/02/08 03:30:54  connolly
# allow or odd indentation of 4.3.8 Integer
#
# Revision 1.17  2004/02/08 00:06:03  connolly
# find domain info in Descriptions of properties as well as Conformance
#
# Revision 1.16  2004/02/07 06:30:12  connolly
# take out broken conformance links to Vcalendar
#
# Revision 1.15  2004/02/07 06:02:02  connolly
# - links from property conformance subsections to components
#
# Revision 1.14  2004/02/07 05:31:21  connolly
# - handle Property Name: Any ... X-
# - add purposes to formal schema as rdfs:comment
#
# Revision 1.13  2004/02/07 05:21:33  connolly
# - simplify subSect
# - fix a typo in RFC 2445
#
# Revision 1.12  2004/02/07 04:55:50  connolly
# - refactored doStructuredSection to collect dd lines
# - use doChars() to fix a bug noted in a comment
# - removed deblank (dead code)
# - removed list(lines) (debugging code)
#
# Revision 1.11  2004/02/07 04:30:41  connolly
# - use generators for depagination, section splitting
# - cite RFC guidelines RFC
# - factor out some hard-coded strings
#
# Revision 1.10  2004/02/07 02:39:10  connolly
# doStructuredSection was getting out of hand;
# refactored it before working on Conformance section
#
# Revision 1.9  2004/02/07 00:04:37  connolly
# find more value type info; turn into allowed-type links
#
# Revision 1.8  2004/02/01 07:43:16  connolly
# recognize (though do not fully handle) value types with defaults
#
# Revision 1.7  2004/02/01 06:55:11  connolly
# add provenance in address element
#
# Revision 1.6  2004/01/30 01:15:39  connolly
# fixed case/hypenation
#
# Revision 1.5  2004/01/29 19:40:47  connolly
# added profile for GRDDL
#
# Revision 1.4  2004/01/29 16:49:32  connolly
# first steps towards gleaning a schema from RFC2445 via XHTML, XSLT
#
# Revision 1.3  2004/01/28 10:29:46  connolly
# handle (some cases) of prose in the Value Type field
# handle more section types
#
# Revision 1.2  2004/01/28 10:02:03  connolly
# groks quite a bit more structure
#
# Revision 1.1  2004/01/28 08:54:24  connolly
# produces pretty reasonable XHTML
#