#!/usr/bin/python
""" slurpIcalSpec.py -- extract formal view of iCalendar/vCard specs
produces XHTML with various typed links
@@TODO: make each marked up item visible via css
(unless they're all linked).
see also: rfc2html in dev.w3.org @@
"""
__version__ = '$Id: slurpIcalSpec.py,v 1.27 2005/11/09 23:10:49 connolly Exp $'
#see also: changelog at end
import sys
import re
class Usage(Exception):
    # Raised by main() for a missing, extra, non-numeric or unsupported
    # RFC-number argument.  The docstring doubles as the message the script
    # prints to stderr, so its text and line layout are user-visible.
    # main() accepts only the RFC number and converts stdin to stdout, so
    # the input and output files are given as shell redirections.
    """USAGE: python slurpIcalSpec.py NNNN <rfcNNNN.txt >rfcNNNN.html
where NNNN is one of the supported RFCS
(so far: 2425, 2426, and 2445)
"""
# Per-RFC scraping parameters, keyed by RFC number.  main() looks the
# command-line number up here and unpacks the 3-tuple.
# rfcnum: (footerStarters, typos, example_tags)
#   footerStarters -- line prefixes; depaginate() treats any line that
#                     starts with one of them as a page break
#   typos          -- (err, fix) pairs; depaginate() replaces err with fix
#                     in every input line
#   example_tags   -- headings for which htmlTop() emits a CSS rule, using
#                     asClass() of the heading as the class name
Specs = { 2445: (("RFC", "Dawson"),
(('"EVENT"', '"VEVENT"'), # 4.8.7.3 Last Modified
(' Purpose This value type',
' Purpose: This value type'),
('11. Full Copyright Statement',
'11 Full Copyright Statement'),
('3.11 Contact for Further Information:',
'3.11 Contact for Further Information'),
),
('Example',),
),
2426: (("RFC", "Dawson"),
# a ref is split across lines; put it on one line
(('Hence, this [MIME-',
'Hence, this [MIME-DIR]'),
('DIR] profile is',
' profile is'),
),
('Type example',),
),
2425: (("RFC", "Howes"),
(),
(),
)
}
def main(argv):
    """Convert the RFC text on stdin to marked-up output on stdout.

    argv -- command-line list; argv[1] must be an RFC number that is a
            key of Specs, and there must be no further arguments.

    Raises Usage when the argument count is wrong or argv[1] is not a
    supported RFC number.
    """
    if len(argv) != 2:
        raise Usage()
    try:
        rfcnum = int(argv[1])
        pgbrks, typos, exampleTags = Specs[rfcnum]
    except (ValueError, KeyError):
        raise Usage()

    w = sys.stdout.write

    # Both stages are generators, so nothing is read until the header
    # section is pulled off the front.
    sections = bySection(depaginate(sys.stdin, pgbrks, typos))
    head = next(sections)
    #print >>sys.stderr, "RFC header:", `head`

    # From here on rfcnum is the string scraped from the header.
    title, rfcnum, category, date, authlines = titleEtc(head)
    w(htmlTop(title, rfcnum, category, date, authlines, exampleTags) + "\n")

    # The reference labels are needed before any body text is formatted,
    # so the remaining sections are collected first.
    body = list(sections)
    refs = findRefs(body)
    for sec in body:
        heading = sec[0]
        if "Table of Contents" in heading:
            tocSect(w, sec)
        elif heading[0].isdigit():
            numSect(w, sec, refs)
        else:
            flowSect(w, sec, refs)

    w(htmlBot() + "\n")
def depaginate(fp, footerStarters, typos):
    """Yield the lines of a paginated RFC with the page furniture removed.

    fp -- object with a readline() method; read until it returns ''
    footerStarters -- prefixes of page header/footer lines; a line that
        starts with one is handled exactly like a form-feed line
    typos -- (err, fix) pairs applied to every line before anything else

    Line endings are stripped.  A page break, together with the blank
    lines on either side of it, comes out as one blank line.  Blank lines
    before the first text line are dropped.  Other blank lines are held
    back until more text turns up, so trailing blank lines are never
    yielded.

    'Each page must be limited to 58 lines followed by a form feed on a
    line by itself.'
    --Instructions to RFC Authors
    Postel & Reynolds Oct 1997
    http://www.ietf.org/rfc/rfc2223
    """
    pending = []        # blank lines waiting for the next text line
    at_page_top = True  # no text seen yet on the current page

    for raw in iter(fp.readline, ''):
        text = raw.rstrip("\r\n")
        for wrong, right in typos:
            text = text.replace(wrong, right)

        is_break = text == "\f"
        if not is_break:
            for prefix in footerStarters:
                if text.startswith(prefix):
                    is_break = True
                    break
        if is_break:
            pending = ['']  # leave a blank line where the pagebreak was
            at_page_top = True
            continue

        if not text:
            if not at_page_top:
                pending.append(text)
            continue

        for blank in pending:
            yield blank
        pending = []
        yield text
        at_page_top = False
def bySection(lines):
    """Group an iterable of lines into sections, yielding lists of lines.

    A non-empty line opens a new section unless it starts with " ",
    "Request for Comments" or "Category:", or contains ":", "<" or "--".
    Every other line, blank lines included, is added to the section in
    progress.  Empty sections are never yielded.
    """
    current = []
    for line in lines:
        continues = (
            not line
            or line.startswith((" ", "Request for Comments", "Category:"))
            or ":" in line
            or "<" in line
            or "--" in line
        )
        if not continues and current:
            yield current
            current = []
        current.append(line)
    if current:
        yield current
def titleEtc(lines):
    """Pick the title and front-matter fields out of an RFC's first section.

    lines -- the header lines, then a blank line, then the title lines.
        The header is read as two columns split at character 40: the
        RFC number and category come from the left column of lines[1]
        and lines[2], the author lines from the right column.

    Returns (title, rfcnum, category, date, authlines):
      title     -- the title lines, stripped and joined with spaces
      rfcnum    -- last word of the left column of lines[1]
      category  -- text after ':' in the left column of lines[2]
      date      -- the stripped line just above the blank separator
      authlines -- stripped right-column text of every header line above
                   the date line

    Raises ValueError when no blank line separates header from title.
    """
    sep = None
    for idx, text in enumerate(lines):
        if text.strip() == '':
            sep = idx
            break
    if sep is None:
        # call-style raise and repr() instead of the Python-2-only
        # "raise E, msg" and backtick forms
        raise ValueError(
            "no blank line separating header from title: " + repr(lines))

    title = ' '.join([l.strip() for l in lines[sep + 1:]]).strip()
    rfcnum = lines[1][:40].strip().split()[-1]
    category = lines[2][:40].strip().split(':')[1].strip()
    date = lines[sep - 1].strip()
    # max() keeps the slice empty when the separator is the first line,
    # where a negative stop would otherwise count from the end.
    authlines = [l[40:].strip() for l in lines[:max(sep - 1, 0)]]
    return title, rfcnum, category, date, authlines
def findRefs(sections):
    """Return the reference labels listed in the References section.

    sections -- sequence of sections, each a non-empty list of lines.

    The first section whose heading (sections[i][0], stripped) ends with
    "References" is scanned.  Entries are separated by blank lines; the
    label is the text between the first '[' and the following ']' on an
    entry's first line.  Later lines of the same entry are ignored, and
    so is an entry whose first line carries no bracketed label.

    Raises ValueError when no section heading ends with "References".
    """
    for sec in sections:
        if not sec[0].strip().endswith("References"):
            continue
        refs = []
        in_entry = False
        for ln in sec[1:]:
            ln = ln.strip()
            if not ln:
                in_entry = False
                continue
            if in_entry:
                continue
            in_entry = True
            opener = ln.find('[')
            closer = ln.find(']', opener + 1) if opener >= 0 else -1
            if closer >= 0:
                #print >>sys.stderr, "found ref:", ref
                refs.append(ln[opener + 1:closer])
        return refs
    raise ValueError("no References section found")
# Build and return the text that main() prints before any section.
#   title, rfcnum, category, date -- strings scraped by titleEtc()
#   authlines   -- list of author/affiliation strings, joined with spaces
#   exampleTags -- headings; one CSS rule is generated per heading, with
#                  asClass() of the heading as the class name
# NOTE(review): the template below has 7 "%s" placeholders but is given
# 9 values, which raises TypeError as written.  It also contains no
# markup at all, so the HTML tags (and whatever consumed css and one
# rfcnum) appear to have been stripped from this copy -- confirm against
# the original file.
# NOTE(review): each generated CSS rule opens "{" but is never closed;
# looks like a missing "}" before the "\n" -- confirm.
def htmlTop(title, rfcnum, category, date, authlines, exampleTags):
css = ""
for t in exampleTags:
css = css + "dd.%s { border-style: solid; border-color: #d0dbe7; border-width: 0 0 0 .25em; padding-left: 0.5em;\n" % asClass(t)
html = """
%s
Network Working Group
Request for Comments: %s
Category: %s
%s
%s
%s
$Revision: 1.27 $ of $Date: 2005/11/09 23:10:49 $
derived from rfc%s.txt
and enhanced
for gleaning formal description
using slurpIcalSpec.py
by Dan Connolly
""" % (title, css, rfcnum, category, ' '.join(authlines), date, title,
rfcnum, rfcnum)
return html
# Return the text that main() prints after the last section.
# NOTE(review): returns an empty string here; presumably the closing
# tags were stripped from this copy along with the other markup --
# confirm against the original file.
def htmlBot():
return ""
# Write an unnumbered section: lines[0] is formatted as the heading,
# then flowSectRest() renders the remaining lines.
#   w     -- write callable (main() passes sys.stdout.write)
#   lines -- the section's lines; lines[0] is the heading
#   refs  -- reference labels, handed on to flowSectRest()
# NOTE(review): the string literal below is split across physical lines
# and carries no markup; the HTML tags appear to have been stripped from
# this copy -- confirm against the original file.
def flowSect(w, lines, refs):
w("
%s
\n" % (lines[0],))
flowSectRest(w, lines, refs)
# Write lines[1:] as flowed paragraphs: each run of non-blank lines is
# one paragraph and a blank line ends it.  Every non-blank line is
# written through bodyText() followed by a newline.
#   w     -- write callable
#   lines -- the section's lines; lines[0] (the heading) is skipped
#   refs  -- reference labels, handed on to bodyText()
# NOTE(review): the w("...") literals are split across physical lines
# and carry no markup; the paragraph/section tags appear to have been
# stripped from this copy -- confirm against the original file.
def flowSectRest(w, lines, refs):
# p is 1 while inside a run of non-blank lines, 0 otherwise
p = 0
for l in lines[1:]:
if l.strip():
if not p: w("
\n")
bodyText(w, l, refs)
w("\n")
p = 1
else:
if p: w("
\n")
p = 0
if p: w("\n")
w("
")
# Write the table-of-contents section.  lines[0] is the heading.  Each
# later non-blank line is stripped and split into an optional leading
# section number (num) and the entry text up to its first "." (head).
#   w     -- write callable
#   lines -- the section's lines
# NOTE(review): this copy of the function is damaged.  The w("...")
# literals are split across lines with their markup gone, and num/head
# are computed but never written.
def tocSect(w, lines):
w("
%s
\n" % (lines[0],))
w("
\n")
for l in lines[1:]:
l = l.strip()
if l == '': continue
if l[0].isdigit():
num, l = l.split(None, 1)
else: num = ''
head = l.split(".", 1)[0]
w("
")
# NOTE(review): from here on the code uses dd, ref and title, none of
# which tocSect defines.  It reads like the tail of the bibliography
# formatter (numSect calls refSect(), which has no def in the visible
# source; changelog 1.22 mentions marking up titles and linking RFCs in
# the bibliography) whose "def" line and opening lines were lost --
# confirm against the original file.
# try to mark up the title
parts = dd.split('"')
if len(parts) == 3:
before, title, after = parts
doChars(w, before)
w("")
# try to make it a link
href = None
if ref.startswith("RFC "):
href = rfcAddr(ref.split(' ')[1])
else:
m = re.search('RFC (\d\d\d\d?)', dd)
if m:
href = rfcAddr(m.group(1))
else:
m = re.search(r'((http|ftp)://[^ ,]+)', dd)
if m:
href = m.group(1)
if href:
w("" % href)
doChars(w, title)
w("")
else:
doChars(w, title)
w("")
doChars(w, after)
else:
doChars(w, dd)
w("
\n")
def rfcAddr(num):
    """Return the ietf.org address built from RFC number num."""
    template = 'http://www.ietf.org/rfc/rfc%s'
    return template % num
def asID(ref):
    """Build an ID from a reference label: prefix 'ref_', spaces to '_'.

    >>> asID('VCARD')
    'ref_VCARD'
    >>> asID('RFC 1872')
    'ref_RFC_1872'
    """
    pieces = ref.split(' ')
    return 'ref_' + '_'.join(pieces)
# Write a numbered section.  lines[0] is split into the section number
# and the heading text, and the heading text (or, failing that, the
# start of lines[2]) selects how the body is rendered: refSect() for
# "References", flowSectRest() for copyright/acknowledgement/abstract
# sections, doStructuredSection() for the "... Name:" templates, and
# line-by-line bodyText() otherwise.
#   w     -- write callable
#   lines -- the section's lines
#   refs  -- reference labels, handed on to the body renderers
# NOTE(review): this copy is damaged.  The w("...") literals are split
# across lines with their markup gone (the heading format receives 3
# values for 2 visible placeholders).
# NOTE(review): lines[2] is read without a length check, so a numbered
# section shorter than 3 lines that matches none of the heading tests
# would raise IndexError -- confirm whether that can occur.
def numSect(w, lines, refs):
num, head = lines[0].split(None, 1)
w("
%s %s
\n" % (num, num, head))
#print >>sys.stderr, "numSect:", num, head
if head == "References":
refSect(w, lines)
elif head == 'Full Copyright Statement' or \
head == 'Acknowledgements' or \
head == 'Acknowledgments' or \
head == 'Abstract':
flowSectRest(w, lines, refs)
elif lines[2].startswith(" Property Name:"):
doStructuredSection(w, lines, refs, "Property", 'Property')
# NOTE(review): the two tests below are identical in this copy;
# presumably they differed in the number of leading spaces before
# whitespace was collapsed -- confirm against the original file.
elif lines[2].startswith(" Value Name:") or \
lines[2].startswith(" Value Name:"): # 4.3.8 (Integer) is indented oddly
doStructuredSection(w, lines, refs, "Value")
elif lines[2].startswith(" Component Name:"):
doStructuredSection(w, lines, refs, "Component", 'Class')
elif lines[2].startswith(" Parameter Name:"):
doStructuredSection(w, lines, refs, "Parameter", 'Property')
elif lines[2].startswith(" To: [email protected]"):
doStructuredSection(w, lines, refs, "Type")
else:
w("
")
for l in lines[1:]:
bodyText(w, l, refs)
w("\n")
w("
\n")
# NOTE(review): from here on the code uses secType, rdfClass and Tags,
# none of which numSect defines.  It reads like the body of
# doStructuredSection() (called above, but with no def in the visible
# source): it collects the lines under each "<tag>:" heading into dd and
# hands each group to subSect().  Its "def" line and opening lines
# appear to have been lost -- confirm against the original file.
secLabel = '%s Name' % secType
dt = ''
dd = []
idx = 2
while idx < len(lines):
l = lines[idx]
if ':' in l:
hd, rest = l.lstrip().split(":", 1)
if hd == secLabel or hd in Tags:
if dd:
subSect(w, secType, rdfClass, dt, dd, refs)
dd = []
dt = hd
dd.append(rest)
else:
dd.append(l)
else:
dd.append(l)
idx += 1
if dd:
subSect(w, secType, rdfClass, dt, dd, refs)
w("
\n")
w("\n")
# Write one tagged subsection of a structured section.
#   w        -- write callable
#   secType  -- section kind; used as a name prefix when rdfClass is empty
#   rdfClass -- when non-empty the name goes through camelCase(), with an
#               initial capital only if rdfClass == 'Class'
#   dt       -- the tag text (e.g. a "... Name" tag or 'Value Type')
#   dd       -- list of text lines collected under the tag
#   refs     -- reference labels, handed on to bodyText()
# For a "... Name" / 'Type name' tag the joined dd text is the item's
# name: surrounding quotes are dropped (and dd[0] is replaced by the
# unquoted name), and a name starting with "Any property name with"
# becomes "X-".
# For 'Value Type', dd[0] is split at its first '.' into a type name and
# the rest; rel starts as 'value-type' and becomes 'default-value-type'
# or 'list-of' depending on the wording.
# NOTE(review): this copy is damaged.  The w("...") literals are split
# across lines with their markup gone (one receives 2 values for a
# single placeholder, another is an unterminated "%s), and `tokens` is
# read in the final loop but never assigned in the visible code --
# confirm against the original file.
# NOTE(review): name[0] would raise IndexError for an empty name --
# confirm whether dd can be blank here.
def subSect(w, secType, rdfClass, dt, dd, refs):
if dt.endswith(" Name") or dt == 'Type name':
name = ''.join(dd).strip()
# VEVENT is quoted extraneously
if name[0] == '"':
name = name[1:-1]
dd[0] = name
if name.startswith("Any property name with"):
name = "X-"
elif rdfClass:
name = camelCase(name, rdfClass == 'Class')
else:
name = secType + "_" + name
w("
%s
\n" % (name, dt))
else:
w("
%s
\n" % (dt,))
w("
" % (asClass(dt),))
if dt == 'Value Type':
rest = dd[0]
rel='value-type'
if '.' in rest:
name, rest = rest.split('.', 1)
name = name.strip()
rest = '.' + rest
else:
name = rest.strip()
rest = ''
if name.startswith("The default"):
rel='default-value-type'
txt = name
name = txt.split()[-1]
w(txt[:-len(name)])
if 'separated' in rest:
rel = 'list-of'
w("%s
")
for l in dd:
if tokens: l = tokenRefs(w, l, rel, tokens)
bodyText(w, l, refs)
w("\n")
w("
\n
\n")
def asClass(t):
    """Return heading tag t with its spaces removed, for use as a class name."""
    return ''.join(t.split(" "))
# Scan l for occurrences of the known tokens.  The text before each
# match is written through doChars(), then one formatted piece is
# written for the match itself.  The text after the last match is
# returned, not written.
#   w      -- write callable
#   l      -- the text to scan
#   rel    -- value passed into each format; when it is 'def', the first
#             occurrence of each target uses the 4-value format
#   tokens -- dict mapping token text to its target
# NOTE(review): both format strings are a bare '%s' but receive 4 and 3
# values, which raises TypeError as written; the markup appears to have
# been stripped from this copy -- confirm against the original file.
# NOTE(review): the keys are joined into the pattern unescaped and in
# dict order, so when one key is a prefix of another the shorter one may
# match first -- confirm whether longest-first ordering is needed.
def tokenRefs(w, l, rel, tokens):
pat = re.compile('|'.join(tokens.keys()))
# targets already written with the 4-value format
seen = {}
while l:
m = pat.search(l)
if not m: break
doChars(w, l[:m.start()])
t = l[m.start():m.end()]
if rel == 'def' and not seen.has_key(tokens[t]):
w('%s' % (tokens[t], rel,
tokens[t], t))
seen[tokens[t]] = 1
else:
w('%s' % (rel, tokens[t], t))
l = l[m.end():]
return l
def camelCase(n, initialCap=0):
    """Turn a hyphenated name into camelCase.

    n -- the name; it is lower-cased and split on '-'
    initialCap -- when true the first word is capitalised too

    Empty words (from a leading, trailing or doubled '-') contribute
    nothing instead of raising IndexError.

    >>> camelCase('LAST-MODIFIED')
    'lastModified'
    >>> camelCase('VEVENT', 1)
    'Vevent'
    """
    # A real list, so it can be indexed and sliced whether or not map()
    # returns one.
    words = [word.lower() for word in n.split('-')]

    def ucfirst(word):
        # slicing rather than word[0], so '' stays ''
        return word[:1].upper() + word[1:]

    if initialCap:
        return ''.join([ucfirst(word) for word in words])
    return words[0] + ''.join([ucfirst(word) for word in words[1:]])
def _test():
    """Run doctest.testmod() with its defaults (selected by --test)."""
    from doctest import testmod
    testmod()
if __name__ == '__main__':
    # "--test" anywhere on the command line runs the doctests; otherwise
    # the arguments go to main(), and a usage error prints the Usage
    # docstring on stderr.
    if '--test' not in sys.argv:
        try:
            main(sys.argv)
        except Usage as e:
            sys.stderr.write(e.__doc__ + "\n")
    else:
        _test()
# $Log: slurpIcalSpec.py,v $
# Revision 1.27 2005/11/09 23:10:49 connolly
# - changed the way duration values are modelled
# The iCalendar DURATION value type is actually more than just a
# XMLSchema.duration; it also has a RELATED parameter.
# So for
# TRIGGER;VALUE=DURATION;RELATED=START:-PT15M
# we'll write
# { ?E cal:trigger [ rdf:value "-PT15M"^^xsdt:duration;
# cal:related "START"] }
#
# - fixed test data to have rdf:datatype on integer
# values, to match the schema (which matches the RFC)
#
# - fixed schema to show DATE-TIME properties (dtstart, ...)
# as DatatypeProperties
# (there are little/no tests for PERIOD; beware)
#
# - scraped more details about property parameters (e.g. partstat, cn,
# cutype, ...) and rrule parts (freq, interval, ...) from the RFC so
# that they show up as links in the hypertext version and as RDF
# properties in the schema. likewise timezone components (standard,
# daylight)
# - side effect: added some whitespace in rfc2445.html
#
# - demoted x- properties
# - removed x- properties from .rdf versions of test data
# this allows the round-trip tests to pass
# - fromIcal.py doesn't output them unless you give the --x option
#
# - added Makefile support for consistency checking with pellet
#
# - demoted blank line diagnostic in fromIcal.py to a comment
#
# - silenced some left-over debug diagnostics in slurpIcalSpec.py
#
# - fixed test/test-created.rdf; added it to fromIcalTest.py list
#
# Revision 1.26 2005/07/22 21:14:32 connolly
# remove : from iCalendar heading
#
# Revision 1.25 2005/07/22 21:00:00 connolly
# - added support for RFC2425, which has
# - numbered Abstract and TOC
# - examples that start in column 1
#
# Revision 1.24 2005/07/22 20:42:12 connolly
# - handle VCARD structured section tags
# - working on example extraction; started with CSS style
# - no bullets on TOC items; just the numbers
# - handle a ref split across lines in VCARD as a couple typos
#
# Revision 1.23 2005/07/22 19:51:28 connolly
# - parameterize RFC-specific bits so it works for RFC2426 also
# - factor out typo handling
# - take RFC number on command line; write diagnostic for incorrect usage
#
# Revision 1.22 2005/07/22 19:28:18 connolly
# - mark up titles in bibliography; make links to RFCs
# - render copyright, acks sections flowed rather than pre
# - fix extra . at end of section ID
#
# Revision 1.21 2005/07/22 18:49:58 connolly
# - handle references in 2 passes
# - 1st pass to find ref labels in refs section
# - 2nd pass to format references from the body and the bibliography
# - added some unit tests and a --test option
# - handle unnumbered overview section in TOC
#
# Revision 1.20 2004/02/29 14:52:00 connolly
# new grddl names
#
# Revision 1.19 2004/02/12 06:31:23 connolly
# fix EVENT to VEVENT typo
#
# Revision 1.18 2004/02/08 03:30:54 connolly
# allow for odd indentation of 4.3.8 Integer
#
# Revision 1.17 2004/02/08 00:06:03 connolly
# find domain info in Descriptions of properties as well as Conformance
#
# Revision 1.16 2004/02/07 06:30:12 connolly
# take out broken conformance links to Vcalendar
#
# Revision 1.15 2004/02/07 06:02:02 connolly
# - links from property conformance subsections to components
#
# Revision 1.14 2004/02/07 05:31:21 connolly
# - handle Property Name: Any ... X-
# - add purposes to formal schema as rdfs:comment
#
# Revision 1.13 2004/02/07 05:21:33 connolly
# - simplify subSect
# - fix a typo in RFC 2445
#
# Revision 1.12 2004/02/07 04:55:50 connolly
# - refactored doStructuredSection to collect dd lines
# - use doChars() to fix a bug noted in a comment
# - removed deblank (dead code)
# - removed list(lines) (debugging code)
#
# Revision 1.11 2004/02/07 04:30:41 connolly
# - use generators for depagination, section splitting
# - cite RFC guidelines RFC
# - factor out some hard-coded strings
#
# Revision 1.10 2004/02/07 02:39:10 connolly
# doStructuredSection was getting out of hand;
# refactored it before working on Conformance section
#
# Revision 1.9 2004/02/07 00:04:37 connolly
# find more value type info; turn into allowed-type links
#
# Revision 1.8 2004/02/01 07:43:16 connolly
# recognize (though do not fully handle) value types with defaults
#
# Revision 1.7 2004/02/01 06:55:11 connolly
# add provenance in address element
#
# Revision 1.6 2004/01/30 01:15:39 connolly
# fixed case/hyphenation
#
# Revision 1.5 2004/01/29 19:40:47 connolly
# added profile for GRDDL
#
# Revision 1.4 2004/01/29 16:49:32 connolly
# first steps towards gleaning a schema from RFC2445 via XHTML, XSLT
#
# Revision 1.3 2004/01/28 10:29:46 connolly
# handle (some cases) of prose in the Value Type field
# handle more section types
#
# Revision 1.2 2004/01/28 10:02:03 connolly
# groks quite a bit more structure
#
# Revision 1.1 2004/01/28 08:54:24 connolly
# produces pretty reasonable XHTML
#