# src/fc2rss/fc2rss.py
"""fc2rss
Screen scrape an OU FirstClass conference or two into RSS files.
Avoid the appalling FC interface!
Stuart Langridge, http://www.kryogenix.org/
v1.0, 11 October 2003
"""
import urllib,re,urlparse,os,cgi
# Required variables
# OU login returned by the opener's prompt_user_passwd() when a server asks
# for authentication. Replace the placeholders before running.
OU_USERNAME = 'YOUR_OU_USERNAME'
OU_PASSWORD = 'YOUR_OU_PASSWORD'
# Conference index URLs to scrape.
CONFERENCES = [
'http://oufcnt2.open.ac.uk/Login/0007584D-80000001/', # m874
'http://oufcnt2.open.ac.uk/Login/00075E75-80000001/' # m879
]
# Directory the generated .xml feed files are written into.
RSS_PATH = '/home/httpd/test/ou/rss'
# Set to a true value to print each conference and message title while parsing.
debugmode = 0
# A line consisting solely of a <title>...</title> element (surrounding
# whitespace allowed); the text is captured as "title".
reTitle = re.compile(r'^\s*<title>(?P<title>[^<]+)</title>\s*$')
# A JavaScript 'leaf[n]=new Array(...)' line; captures the two quoted
# arguments as "id" and "author".
reItem = re.compile(r'\s*leaf\[\d+\]=new Array\(\d+,\d+,\d+,\d+,"(?P<id>[^"]+)", "(?P<author>[^"]+)"\);\s*$')
# A line that begins (after optional whitespace) with an opening <div tag.
reDiv = re.compile(r'\s*<div ')
# A closing </div> tag, optionally preceded by whitespace.
reEndDiv = re.compile(r'\s*</div>')
# Any single HTML tag.
reHTML = re.compile('<[^>]+>')
# Characters a fetched page is split on: vertical tab, form feed, LF, CR.
ws = re.compile(r'[\x0b\x0c\n\r]')
# Handle required authentication
class myOpener(urllib.FancyURLopener):
    """URL opener that supplies the configured OU login when asked for one."""

    def prompt_user_passwd(self, host, realm):
        """Return the (username, password) pair; host and realm are ignored."""
        credentials = (OU_USERNAME, OU_PASSWORD)
        return credentials


# Install an instance as urllib's module-level opener.
urllib._urlopener = myOpener()
def _fetchLines(url):
    """Fetch url and return its body as a list of lines.

    The body is split on the module-level ws pattern. The connection is
    closed before returning, including when reading raises.
    """
    fp = urllib.urlopen(url)
    try:
        return ws.split(fp.read())
    finally:
        fp.close()


def _parseItem(itemuri):
    """Scrape one message page and return its (title, text) pair.

    title is the text of the last <title> line on the page ('' if none).
    text is the content of the captured <div> regions with all tags removed
    except line breaks, which are kept as <br>.
    """
    title = ''
    chunks = []
    capture = 0
    for line in _fetchLines(itemuri):
        t = reTitle.match(line)
        if t:
            title = t.group('title')
            if debugmode:
                print(' %s' % title)
        # Capture runs from a line opening with "<div " through the next line
        # containing "</div>"; both boundary lines are included.
        if reDiv.match(line):
            capture = 1
        if capture:
            chunks.append(line)
        if reEndDiv.search(line):
            capture = 0
    # Lines are joined without a separator, as the split removed them.
    text = ''.join(chunks)
    # Park <br> as newlines so the breaks survive tag stripping, then restore
    # them once every other tag has been removed.
    text = re.sub('<br>', '\n', text)
    text = reHTML.sub('', text)
    text = re.sub('\n', '<br>', text)
    text = re.sub('\x00', '', text)
    return title, text


def parseConf(conf):
    """Scrape one conference index page and every message it lists.

    conf is the conference URL. Each line matching reItem names a message;
    its id is resolved against conf and the resulting page is fetched and
    parsed.

    Returns (title, items): title is the text of the last <title> line seen
    on the index page ('' when none matched), and items is a list of
    (id, author, uri, title, text) tuples in page order.
    """
    title = ''
    items = []
    for line in _fetchLines(conf):
        t = reTitle.match(line)
        if t:
            title = t.group('title')
            if debugmode:
                print('---%s---' % title)
        i = reItem.match(line)
        if i:
            iid = i.group('id')
            iauth = i.group('author')
            itemuri = urlparse.urljoin(conf, iid)
            title2, text2 = _parseItem(itemuri)
            items.append((iid, iauth, itemuri, title2, text2))
    return title, items
# Walk the conferences
confdata = {}
for conf in CONFERENCES:
    title,items = parseConf(conf)
    # Keep only conferences that produced both a title and at least one item.
    if title and items:
        confdata[title] = (conf,items)

# Write one RSS 2.0 file per scraped conference. The file name is the
# conference title with spaces replaced by underscores, plus '.xml'.
for conf in confdata:
    url,items = confdata[conf]
    fp = open(os.path.join(RSS_PATH,conf.replace(' ','_')) + '.xml','w')
    try:
        fp.write('''<rss version="2.0">
<channel>
<title>%s</title>
<link>%s</link>
<description>%s</description>
''' % (conf,url,conf))
        for item in items:
            iid,auth,uri,title,text = item
            # NOTE(review): only the description is escaped; author, link and
            # title are written as scraped -- confirm they cannot contain
            # raw '&' or '<'.
            fp.write(''' <item>
<author>%s</author>
<description>%s</description>
<link>%s</link>
<title>%s</title>
</item>''' % (auth,cgi.escape(text),uri,title))
        fp.write(' </channel>\n</rss>\n')
    finally:
        # Close on every path so buffered output is flushed and the handle
        # is not left to interpreter teardown.
        fp.close()