Noel O'Blog: RSS feeds for chemistry projects on SourceForge

Monday, 12 May 2008

RSS feeds for chemistry projects on SourceForge

Wouldn't it be nice to know whether any of your favourite chemistry projects has released a new version? Or to keep abreast of the latest registrations of chemistry projects on SourceForge? No? I guess it's just me then.

Anyway, here are some RSS feeds I've thrown together to allow me to do just that:

latest chemistry releases
registrations of chemistry projects: latest and two months old

Note: the two-months-old RSS feed is better if you want to find some actual code or a working website when you click through.

Want to do the same for another software category? Here's the code (written for Python 2; requires BeautifulSoup 3 and PyRSS2Gen):


import datetime
import urllib

from BeautifulSoup import BeautifulSoup
import PyRSS2Gen

def download():
    """Fetch the two SourceForge search pages the feeds are built from.

    Both queries search for trove:(384) and request the first 100 hits in
    descending order; they differ only in the sort field. The pages are
    saved in the current directory as "releases.html" (sorted by latest
    file date) and "registrations.html" (sorted by registration date).
    """
    prefix = "http://sourceforge.net/search/index.php?words=trove%3A%28384%29&sort="
    suffix = "&sortdir=desc&offset=0&limit=100&type_of_search=soft&pmode=0"
    pages = (("latest_file_date", "releases.html"),
             ("registration_date", "registrations.html"))
    for sortfield, target in pages:
        urllib.urlretrieve(prefix + sortfield + suffix, target)

def converttodate(text):
    """Turn a dash-separated date string into a datetime.

    text -- a string such as "2008-05-12", or the placeholder "(none)"

    Returns None for "(none)"; otherwise the dash-separated fields are
    converted to integers and passed, in order, to datetime.datetime().
    """
    if text != "(none)":
        fields = [int(piece) for piece in text.split("-")]
        return datetime.datetime(*fields)
    return None

def makerss(filename, items, sortby, title):
    """Write *items* to *filename* as an RSS 2.0 feed.

    filename -- path of the RSS file to create (overwritten if it exists)
    items    -- list of dictionaries; the keys "title", "link",
                "description" and "lastrelease" are read from each one,
                plus whichever key is named by sortby
    sortby   -- key whose value is used as each entry's pubDate
    title    -- title of the feed
    """
    entries = [
        PyRSS2Gen.RSSItem(
            title = item["title"],
            link = item["link"],
            description = item["description"],
            # Title plus last release date, so a new release of the same
            # project gets a different guid.
            guid = PyRSS2Gen.Guid("%s %s" % (item["title"], item['lastrelease'])),
            pubDate = item[sortby])
        for item in items]

    rss = PyRSS2Gen.RSS2(
        title = title,
        link = "http://baoilleach.blogspot.com/2008/05/rss-feeds-for-chemistry-projects-on.html",
        description = "baoilleach's RSS feed of "
                      "Chemistry projects on SourceForge",
        # PyRSS2Gen writes datetimes out labelled as GMT, so the build time
        # has to be taken in UTC rather than local time.
        lastBuildDate = datetime.datetime.utcnow(),
        items = entries)

    # Close the file explicitly so the feed is flushed to disk even if
    # write_xml() raises part-way through.
    outfile = open(filename, "w")
    try:
        rss.write_xml(outfile)
    finally:
        outfile.close()

def analyse(project):
    """Collect the details of one project from the search results page.

    project -- the heading tag of a search hit; its first <a> element
               carries the project name and a site-relative link

    Returns a dictionary with the keys "title", "link", "lastrelease",
    "registered" and "description". The two dates are read from the sixth
    and fifth <td> cells of the heading's grandparent element and passed
    through converttodate(); the description is taken from the first <td>
    of the element that follows that grandparent.
    """
    anchor = project.a
    row = project.parent.parent
    cells = row('td')

    details = {
        'title': anchor.string,
        'link': "http://sf.net" + anchor['href'],
        'lastrelease': converttodate(cells[5].string.strip()),
        'registered': converttodate(cells[4].string.strip()),
    }

    pieces = row.findNextSibling().td.contents
    summary = pieces[0].strip()
    if not summary:
        # The first piece of the cell is blank; use the third piece instead.
        summary = pieces[2].strip()
    details['description'] = summary

    return details

def processfile(filename):
    """Parse a saved search results page into a list of project records.

    filename -- path of an HTML file saved from the SourceForge search

    Returns one dictionary per project heading found, as built by
    analyse(), in document order.
    """
    # Read the whole page, closing the file even if read() fails.
    infile = open(filename, "r")
    try:
        html = infile.read()
    finally:
        infile.close()
    soup = BeautifulSoup(html)

    def isproject(tag):
        # A project hit is an <h3> whose link points below /projects/.
        # .get() is used rather than [] so that an anchor without an href
        # is skipped instead of aborting the search with a KeyError.
        return (tag.name == "h3" and tag.a
                and tag.a.get('href', '').startswith("/projects/"))

    return [analyse(project) for project in soup.findAll(isproject)]

if __name__=="__main__":
    # Refresh the two saved search pages before parsing them.
    download()

    # Registrations: one feed of everything on the page, and one restricted
    # to projects registered at least 60 days ago.
    data = processfile("registrations.html")
    sometimeago = datetime.datetime.now() - datetime.timedelta(days=60)
    # converttodate() returns None for "(none)", and comparing None with a
    # datetime raises TypeError, so entries without a date are left out of
    # the "old" feed rather than crashing the run.
    olddata = [d for d in data
               if d['registered'] is not None and d['registered'] <= sometimeago]
    makerss("oldregistrations.rss", olddata, "registered", "Registrations 60 days ago on SF")
    makerss("newregistrations.rss", data, "registered", "Latest registrations on SF")

    # Releases: a single feed keyed on the last release date.
    data = processfile("releases.html")
    makerss("latestreleases.rss", data, "lastrelease", "Latest releases on SF")

2 comments:

Anonymous said...: This is a great idea, thanks! Now we know how you stay on top of new chemistry projects.; 12 May 2008 at 15:15
Andrew Dalke said...: PyRSS2Gen? I'm feeling the love. :); 19 May 2008 at 11:58