Thursday 12 September 2024

Do NOT use SDFdb

It's not even a real database. All it does is:

import sdfdb
# Constructing the object opens the file and indexes it by title straight away.
db = sdfdb.SDFdb("mysdf.sdf")
# Returns the text of the record titled "mytitle"; raises KeyError if absent.
molfile = db.get_mol("mytitle")

Ok, so it's quick to index even quite a large SD file. But if Andrew Dalke ever sees the corners it cuts...! I mean, it doesn't even support Windows line endings.

And the code? Well, it speaks for itself:

import re

class SDFdb:
    """Random access to the records of an SD file, keyed by title line.

    The file is scanned once at construction time to record the byte offset
    at which every record starts. ``get_mol`` then seeks straight to the
    requested record instead of re-reading the file.

    Both ``\\n`` and ``\\r\\n`` line endings are accepted. Titles must be
    ASCII. If a title occurs more than once a warning is printed and the last
    occurrence wins.

    The object holds an open file handle; call ``close()`` or use it as a
    context manager (``with SDFdb(path) as db: ...``).
    """

    # A "$$$$" line (anchored to the start of a line so that "$$$$" at the
    # end of a data line is not mistaken for a delimiter) followed by the
    # title line of the next record. Group 1 is the title without its line
    # ending.
    _RECORD_START = re.compile(rb"^\$\$\$\$\r?\n(.*?)\r?\n", re.MULTILINE)

    def __init__(self, fname, chunksize=100000):
        """Open *fname* and build the title index.

        Args:
            fname: Path of the SD file.
            chunksize: Approximate number of bytes scanned per read while
                indexing. Must be at least 1.

        Raises:
            ValueError: If *chunksize* is less than 1.
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If a title is not ASCII.
        """
        if chunksize < 1:
            raise ValueError("chunksize must be a positive number of bytes")
        self.fname = fname
        self.chunksize = chunksize
        self.file = open(self.fname, "rb")
        try:
            self._create_index()
        except BaseException:
            # Do not leak the handle when indexing fails part-way through.
            self.file.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _add_record(self, title, position):
        """Register a record titled *title* that starts at byte *position*."""
        if title in self.lookup:
            print(f"WARNING: Duplicate title {title}")
        self.lookup[title] = len(self.start)
        self.start.append(position)

    def _create_index(self):
        """Populate ``self.lookup`` (title -> record number) and ``self.start``.

        ``self.start[i]`` is the byte offset of record ``i``. One extra entry
        holding the end-of-file offset is appended so that the length of
        record ``i`` is always ``start[i + 1] - start[i]``.
        """
        self.lookup = {}
        self.start = []
        self.file.seek(0)

        first_line = self.file.readline()
        if not first_line:
            # Empty file: no records, only the end-of-file sentinel.
            self.start.append(self.file.tell())
            return
        self._add_record(first_line.rstrip().decode("ascii"), 0)

        position = self.file.tell()
        while text := self.file.read(self.chunksize):
            # Extend the chunk to the end of the current line so that every
            # chunk starts at the beginning of a line.
            if not text.endswith(b"\n"):
                text += self.file.readline()
            # If the chunk stops right after a "$$$$" line, pull in the title
            # line as well so the pattern can see delimiter and title together.
            last_line = text[text.rfind(b"\n", 0, -1) + 1:]
            if last_line.rstrip(b"\r\n") == b"$$$$":
                text += self.file.readline()
            for m in self._RECORD_START.finditer(text):
                # The record starts where its title line starts.
                self._add_record(m.group(1).decode("ascii"),
                                 position + m.start(1))
            position = self.file.tell()
        self.start.append(self.file.tell())  # end-of-file sentinel

    def get_mol(self, title):
        """Return the full text of the record whose title line is *title*.

        The returned string runs from the title line up to and including the
        record's terminating ``$$$$`` line, exactly as stored in the file.

        Raises:
            KeyError: If no record has that title.
        """
        try:
            idx = self.lookup[title]
        except KeyError:
            raise KeyError(f"Title '{title}' not found") from None
        begin = self.start[idx]
        end = self.start[idx + 1]
        self.file.seek(begin)
        return self.file.read(end - begin).decode("ascii")

    def close(self):
        """Close the underlying file; the object cannot be used afterwards."""
        self.file.close()

...though maybe I can see it would be useful for a large number of random accesses into a large (well-behaved) SD file.

That is, if they weren't already in a database. (Or *cough* they were in a database but you thought this was simpler than writing the code to batch up queries.)

No comments: