It's not even a real database. All it does is:
import sdfdb

# Index the SD file once, then fetch a single record by its title line.
db = sdfdb.SDFdb("mysdf.sdf")
molfile = db.get_mol("mytitle")
# The index keeps the file open for later lookups; release it when done.
db.close()
Ok, so it's quick to index even quite a large SD file. But if Andrew Dalke ever sees the corners it cuts...! I mean, it doesn't even support Windows line endings.
And the code? Well, it speaks for itself:
import re


class SDFdb:
    """Title-keyed random access into an SD file.

    The file is scanned once at construction time to record the byte offset
    at which every record starts.  ``get_mol`` then seeks straight to the
    requested record and returns its text (title line through the closing
    ``$$$$`` line, inclusive).

    Limitations (unchanged from the original design):

    * Only ``\\n`` line endings are recognised; records in a file with
      Windows line endings will not be found.
    * Titles and record text must be ASCII.
    * When a title occurs more than once a warning is printed and the last
      occurrence wins.

    Attributes:
        fname: Path the index was built from.
        file: The underlying binary file object (kept open for lookups).
        lookup: Maps title -> record number.
        start: Byte offset of each record, plus a final entry holding the
            end-of-file offset so that ``start[i + 1] - start[i]`` is always
            the length of record ``i``.

    The object can be used as a context manager, which closes the file on
    exit.
    """

    # A "$$$$" delimiter line immediately followed by the title line of the
    # next record.  Anchored to the start of a line so that a data line which
    # merely ends in "$$$$" is not mistaken for a delimiter.
    _NEXT_RECORD = re.compile(rb"^\$\$\$\$\n(.*)\n", re.MULTILINE)

    def __init__(self, fname, chunksize=100000):
        """Open *fname* and build the title index.

        Args:
            fname: Path to the SD file.
            chunksize: Approximate number of bytes read per indexing step.

        Raises:
            ValueError: If *chunksize* is not positive.
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If a title is not ASCII.
        """
        if chunksize < 1:
            raise ValueError("chunksize must be a positive integer")
        self.fname = fname
        self.file = open(self.fname, "rb")
        try:
            self._create_index(chunksize)
        except BaseException:
            # Do not leak the handle if indexing fails part-way through.
            self.file.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _create_index(self, chunksize=100000):
        """Scan the file in chunks and fill ``self.lookup`` / ``self.start``."""
        self.lookup = {}
        self.start = []
        fh = self.file

        # The first record has no preceding "$$$$" line, so handle it here.
        # NOTE: this title is stripped of all trailing whitespace whereas
        # later titles are taken verbatim from the file; kept as-is so that
        # existing lookup keys do not change.
        first_line = fh.readline()
        if first_line:  # an empty file contains no records at all
            self.lookup[first_line.rstrip().decode("ascii")] = 0
            self.start.append(0)

        while True:
            chunk_start = fh.tell()
            text = fh.read(chunksize)
            if not text:
                break
            # Extend the chunk to a line boundary so that every chunk starts
            # at the beginning of a line.
            if not text.endswith(b"\n"):
                text += fh.readline()
            # If the chunk stops right after a delimiter line, pull in the
            # title line too so that the pattern can match within this chunk.
            if text == b"$$$$\n" or text.endswith(b"\n$$$$\n"):
                text += fh.readline()

            for match in self._NEXT_RECORD.finditer(text):
                title = match.group(1).decode("ascii")
                if title in self.lookup:
                    print(f"WARNING: Duplicate title {title}")
                self.lookup[title] = len(self.start)
                # The record begins at its title line, i.e. just past the
                # delimiter line.
                self.start.append(chunk_start + match.start(1))

        # Sentinel: end of the last record (the read loop stopped at EOF).
        self.start.append(fh.tell())

    def get_mol(self, title):
        """Return the text of the record whose title line is *title*.

        Raises:
            KeyError: If no record with that title was indexed.
        """
        try:
            idx = self.lookup[title]
        except KeyError:
            raise KeyError(f"Title '{title}' not found") from None
        begin = self.start[idx]
        end = self.start[idx + 1]
        self.file.seek(begin)
        return self.file.read(end - begin).decode("ascii")

    def close(self):
        """Close the underlying file; further lookups will fail."""
        self.file.close()
...though maybe I can see it would be useful for a large number of random accesses into a large (well-behaved) SD file.
That is, if they weren't already in a database. (Or *cough* they were in a database but you thought this was simpler than writing the code to batch up queries.)