Monday 23 June 2008

An RSS feed for the CCL list

Apparently some people still read the CCL (Computational Chemistry List) using email. I gave up on that some time ago. Here's an RSS feed I threw together some time ago, and which you might find useful: CCL feed

If you don't know what an RSS feed is, and how it might be useful, it's quicker just to test it out than to explain. First of all, you need a feed reader: for example get an account on Google Reader, a free online RSS reader. Next, subscribe to whatever RSS feeds you want, by clicking on "Add subscription" and copying and pasting the URL of the RSS feed into the box that appears.

Here's how the feed is created:
import sys
import email
import datetime
import pdb

from ftplib import FTP
from StringIO import StringIO

import PyRSS2Gen

def breakdaily(messages):
"""
>>> import pickle
>>> a = pickle.load(open("tmp20070105"))
>>> len(breakdaily(a))
1
"""
broken = []
message = []
for line in messages.split("\n"):
if line.startswith("From owner-chemistry@ccl.net"):
broken.append("\n".join(message))
message = []
else:
message.append(line)
broken.append("\n".join(message))
return broken[1:]

def getlatest(N):
ftp = FTP('ftp.ccl.net')
ftp.login('anonymous')
ftp.cwd('/pub/chemistry/archived-messages')
# ftp.dir()

listoffiles = ftp.nlst()

# Get current year
year = datetime.datetime.now().year

# Get N most recent messages
messagetot = 0
months = [str(x).zfill(2) for x in range(1, 13)]
months.reverse()
days = [str(x).zfill(2) for x in range(1, 32)]
days.reverse()
msgs = []


while messagetot<N:
ftp.cwd(str(year))

for month in months:
ftp.cwd(month)
availabledays = ftp.nlst()
for day in [x for x in days if x in availabledays]:
# Go thru in reverse order but exclude days that are
# non-existent
messages = StringIO()
ftp.retrbinary("RETR %s" % day, messages.write)
## pickle.dump(messages.getvalue(), open("tmp%i%s%s" % (year,month,day), "w"))
listmsgs = breakdaily(messages.getvalue())
# print "="*24 + "\n", messages.getvalue()
for i,msg in enumerate(listmsgs):
msg_content = email.message_from_string(msg)
text = ""
for part in msg_content.walk():
if (part.get_content_maintype()=="text" and
part.get_content_type()=="text/plain"):
text = part.get_payload()
text = text.replace("\n","<br/>").decode("iso-8859-1", "strict")
msgs.append( (year, month, day, msg_content, i+1, text) )
messagetot += len(listmsgs)
if messagetot>=N:
break
if messagetot>=N:
break
ftp.cwd("..")
ftp.cwd("..")
year -= 1 # Continue into the previous year

ftp.quit()
msgs.reverse()

return msgs

def main():

print "\nStarting..."

messages = getlatest(100)
## outputfile = open("messages.pickle", "w")
## pickle.dump(messages, outputfile)
## outputfile.close()
## import sys
## sys.exit(1)
## messages = pickle.load(open("messages.pickle", "r"))

rssitems = []
for year, month, day, msg, id, messagetext in messages:

# Add the new item
newitem = PyRSS2Gen.RSSItem(
title = msg['Subject'],
link = "http://ccl.net/cgi-bin/ccl/message-new?%d+%s+%s+%s" % (
year, month, day, str(id).zfill(3)),
description = messagetext,
## What's a guid? A globally unique id...used by RSS readers
## to determine whether they've seen a particular news item
## already
guid = PyRSS2Gen.Guid("http://ccl.net/cgi-bin/ccl/message-new?%d+%s+%s+%s" % (
year, month, day, str(id).zfill(3))),
pubDate = msg['Date']
)
rssitems.append(newitem)

rss = PyRSS2Gen.RSS2(
title = "CCL",
link = "http://www.ccl.net",
description = "RSS Feed of the world's greatest computational chemistry "
"mailing list, chemistry@ccl.net (the CCL list)",
lastBuildDate = datetime.datetime.now(),
items = rssitems)
rss.write_xml(open("ccl.rss", "w"))

print "Finishing...\n"

def test():
import doctest
doctest.testmod()

def do_debugger(type, value, tb):
pdb.pm()

if __name__=="__main__":

# sys.excepthook = do_debugger
main()

## test()

6 comments:

ChemSpiderman said...

Thanks for creating that feed...less emails now. Yay!

Karol said...

It seems your feed does not validate... so it is not usable with some readers, like Brief.

Noel O'Boyle said...

Karol, thanks for the link. I had this problem myself from time to time when reading with Sage (Firefox plugin). I think I tried to find a solution but failed - I may try again. It's something to do with the content of the emails send to CCL particularly when they show up as binary files in the archives (about 1 in 200 emails are like this).

Karol said...

For the record, this feed now validates!

Vladimir Chupakhin said...

Thanks a lot!

waiting for Amber, modeller and other molecular modeling mail lists :)

Noel O'Boyle said...

Regular mailing lists are easily converted to RSS feeds using Nabble.