Monday, 12 May 2008

RSS feeds for chemistry projects on SourceForge

Wouldn't it be nice to know whether any of your favourite chemistry projects has released a new version? Or to keep abreast of the latest registrations of chemistry projects on SourceForge? No? I guess it's just me then.

Anyway, here are some RSS feeds I've thrown together to allow me to do just that:
  • latest chemistry releases
  • registrations of chemistry projects: latest and two months old

Note: the two-months-old RSS feed is better if you want to find some actual code or a working website when you click through.

Want to do the same for another software category? Here's the code (requires BeautifulSoup and PyRSS2Gen):

import datetime
import urllib

from BeautifulSoup import BeautifulSoup
import PyRSS2Gen

def download(category=384):
    """Save the two SourceForge search-result pages the feeds are built from.

    Two searches restricted to one trove category are fetched, each asking
    for up to 100 results in descending order:

    * sorted by latest file date  -> written to "releases.html"
    * sorted by registration date -> written to "registrations.html"

    Both files are written to the current working directory and overwrite
    any existing file of the same name.

    category -- numeric id placed inside the "trove:(...)" search term.
                Defaults to 384, the id this script originally hard-coded
                for its chemistry feeds.
    """
    # "%3A", "%28" and "%29" are the URL-encoded ":", "(" and ")" of the
    # search term "trove:(<category>)".
    search = ("http://sourceforge.net/search/index.php?words=trove%3A%28"
              + str(category) + "%29")
    options = "&sortdir=desc&offset=0&limit=100&type_of_search=soft&pmode=0"

    targets = [("latest_file_date", "releases.html"),
               ("registration_date", "registrations.html")]
    for sortkey, filename in targets:
        urllib.urlretrieve(search + "&sort=" + sortkey + options, filename)

def converttodate(text):
    """Convert a dash-separated date string into a datetime.datetime.

    text -- string such as "2008-05-12"; the dash-separated fields are
            converted to integers and passed, in order, to the
            datetime.datetime constructor.

    Returns None when text is the placeholder "(none)" or contains nothing
    but whitespace; otherwise returns the corresponding datetime.

    Raises ValueError if a field is not an integer or is out of range, and
    TypeError if fewer than three fields are supplied.
    """
    text = text.strip()
    # An empty cell would otherwise crash in int(""); treat it the same way
    # as the explicit "(none)" placeholder.
    if not text or text == "(none)":
        return None

    fields = [int(part) for part in text.split("-")]
    return datetime.datetime(*fields)

def makerss(filename, items, sortby, title):
    """Write an RSS 2.0 feed describing items to filename.

    filename -- path of the XML file to create (opened in "w" mode, so an
                existing file is overwritten).
    items    -- iterable of dicts providing the keys "title", "link",
                "description", "lastrelease" and the key named by sortby.
    sortby   -- key whose value is used as each entry's pubDate. Despite
                the name, no sorting is done here; entries are emitted in
                the order given.
    title    -- channel title of the feed.
    """
    entries = [
        PyRSS2Gen.RSSItem(
            title=item["title"],
            link=item["link"],
            description=item["description"],
            # The guid combines title and last release date, so it changes
            # whenever the recorded last release date changes.
            guid=PyRSS2Gen.Guid(
                "%s %s" % (item["title"], item["lastrelease"])),
            pubDate=item[sortby])
        for item in items]

    rss = PyRSS2Gen.RSS2(
        title=title,
        link="http://baoilleach.blogspot.com/2008/05/rss-feeds-for-chemistry-projects-on.html",
        description="baoilleach's RSS feed of "
                    "Chemistry projects on SourceForge",
        # NOTE(review): PyRSS2Gen writes datetimes out labelled as GMT, so
        # the build time is taken in UTC rather than local time - confirm
        # against the installed PyRSS2Gen version.
        lastBuildDate=datetime.datetime.utcnow(),
        items=entries)

    # Close the file deterministically instead of leaking the handle.
    with open(filename, "w") as output:
        rss.write_xml(output)

def analyse(project):
    """Collect the feed fields for one project from its result heading.

    project -- parsed tag whose first <a> child carries the project name
               and a site-relative href (processfile passes <h3> tags).

    Returns a dict with the keys 'title', 'link', 'lastrelease',
    'registered' and 'description'. The two dates come from the sixth and
    fifth <td> cells found under the heading's grandparent element; the
    description comes from the first <td> of that element's next sibling.
    """
    title = project.a.string
    link = "http://sf.net" + project.a['href']

    row = project.parent.parent
    cells = row('td')
    released = converttodate(cells[5].string.strip())
    registered = converttodate(cells[4].string.strip())

    pieces = row.findNextSibling().td.contents
    summary = pieces[0].strip()
    if not summary:
        # First text node is blank: fall back to the third child node.
        summary = pieces[2].strip()

    return {
        'title': title,
        'link': link,
        'lastrelease': released,
        'registered': registered,
        'description': summary,
    }

def processfile(filename):
    """Parse a saved search-results page into a list of project dicts.

    filename -- path of the HTML file to read.

    Returns a list with one analyse() result for every <h3> tag whose first
    <a> child has an href starting with "/projects/".
    """
    # Read inside a with-block so the handle is closed once the text is in
    # memory rather than whenever the garbage collector gets to it.
    with open(filename, "r") as page:
        html = page.read()
    soup = BeautifulSoup(html)

    def isprojectheading(tag):
        # .get() instead of ['href']: an <a> without an href attribute must
        # be skipped, not abort the whole run with a KeyError.
        return (tag.name == "h3" and tag.a
                and tag.a.get("href", "").startswith("/projects/"))

    return [analyse(heading) for heading in soup.findAll(isprojectheading)]

if __name__ == "__main__":
    # Fetch fresh copies of both search-result pages before parsing them.
    download()

    # Registration feeds: everything on the page, plus the subset that was
    # registered at least 60 days ago.
    data = processfile("registrations.html")
    sometimeago = datetime.datetime.now() - datetime.timedelta(days=60)
    # converttodate() returns None for a "(none)" date; comparing None with
    # a datetime raises TypeError, so such entries are left out of the
    # "old" feed instead of aborting the run.
    olddata = [d for d in data
               if d['registered'] is not None
               and d['registered'] <= sometimeago]
    makerss("oldregistrations.rss", olddata, "registered", "Registrations 60 days ago on SF")
    makerss("newregistrations.rss", data, "registered", "Latest registrations on SF")

    # Release feed, dated by each project's last release.
    data = processfile("releases.html")
    makerss("latestreleases.rss", data, "lastrelease", "Latest releases on SF")

2 comments:

blog said...

This is a great idea, thanks! Now we know how you stay on top of new chemistry projects.

Andrew Dalke said...

PyRSS2Gen? I'm feeling the love. :)