#!/usr/bin/python

# Copyright (C) 2003-2008, Christof Meerwald
# http://cmeerw.org

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program;  if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA

"""Import an RDF dump read from stdin into the SQLite database 'odp.db'.

The dump is parsed with a namespace-aware SAX parser.  Categories
(``Topic`` elements) are written to the ``category`` table and related
tables; the pages of each category (``ExternalPage`` elements) are packed
into one zlib-compressed blob per category in the ``pagelist`` table.

This is Python 2 code (``buffer``, ``cmp``, comparison-function sorting).
Output uses single-argument ``print(...)`` so the printed text is the same
as with the print statement.
"""

import apsw, getopt, string, sys, time, zlib
import xml.sax, xml.sax.handler


# XML namespaces used as the first element of SAX (namespace, localname) names
_NS_DMOZ = 'http://dmoz.org/rdf'
_NS_DC = 'http://purl.org/dc/elements/1.0/'
_NS_RDF = 'http://www.w3.org/TR/RDF/'

# attribute key carrying the target of link-like elements
_ATTR_RESOURCE = (_NS_RDF, 'resource')

# only categories on the path to, or below, this category are imported
CONF_subcat = ['Top']
CONF_verbose = 0

stat_links = 0
stat_newcats = 0
stat_updatecats = 0
time_start = time.time()

# ids handed out to categories when the dump carries no catid element
catid_counter = 1073741824

# category id caches: a permanent one for 2- and 3-element paths, and two
# rotating buckets (at most 256 entries each) for deeper paths
_catid_cache = {}
_catid_buckets = [{}, {}]
_catid_bucket = 0


def get_catid(cursor, topic):
    """Return the category id for the path ``topic``.

    ``topic`` is a list of category names starting at the root.  An empty
    path yields 0 and a single-element path yields 1; a trailing empty
    component is ignored.  Returns -1 when no matching row exists in the
    ``category`` table.
    """
    global _catid_bucket

    if len(topic) == 0:
        return 0
    elif len(topic) == 1:
        return 1
    elif topic[-1] == '':
        return get_catid(cursor, topic[:-1])

    # cache lookup (1st and 2nd level)
    try:
        if len(topic) == 2:
            return _catid_cache[(topic[1],)]
        elif len(topic) == 3:
            return _catid_cache[(topic[1], topic[2])]
    except KeyError:
        pass

    if len(topic) > 3:
        for bucket in _catid_buckets:
            try:
                return bucket[tuple(topic)]
            except KeyError:
                pass

    parent_id = get_catid(cursor, topic[:-1])
    catid = None
    for row in cursor.execute(
            'SELECT id FROM category where pid=? AND name=?',
            (parent_id, topic[-1])):
        catid = row[0]

    if catid is None:
        return -1

    # update cache
    if len(topic) == 2:
        _catid_cache[(topic[1],)] = catid
    elif len(topic) == 3:
        _catid_cache[(topic[1], topic[2])] = catid
    else:
        bucket = _catid_buckets[_catid_bucket]
        if len(bucket) >= 256:
            # current bucket is full: advance to the next one and start it
            # empty, which drops whatever that bucket held before
            _catid_bucket = (_catid_bucket + 1) % len(_catid_buckets)
            _catid_buckets[_catid_bucket] = {}
            bucket = _catid_buckets[_catid_bucket]
        bucket[tuple(topic)] = catid

    return catid


def get_catname(db, id):
    """Return the list of category names leading to category ``id``.

    Ids 0 and 1 yield an empty list.  Raises ``KeyError`` when ``id`` has
    no row in the ``category`` table.
    """
    if id == 0 or id == 1:
        return []

    cursor = db.cursor()
    for name, parent_id in cursor.execute(
            'SELECT name, pid FROM category where id=?', (id,)):
        fname = get_catname(db, parent_id)
        fname.append(name)
        return fname

    raise KeyError(id)


def simplify(s):
    """Return the sort key for a page title: the lower-cased title."""
    return s.lower()


def pagesorter(p1, p2):
    """Comparison function ordering the page tuples of one category.

    Elements 0, 1 and 2 of a page tuple are priority, mediadate and sort
    key.  Pages are ordered by priority descending, then by mediadate
    descending with an empty mediadate first, then by sort key ascending.
    """
    if p1[0] != p2[0]:
        return -cmp(p1[0], p2[0])

    if p1[1] == p2[1]:
        return cmp(p1[2], p2[2])
    elif p1[1] == '':
        return -1
    elif p2[1] == '':
        return 1
    else:
        return -cmp(p1[1], p2[1])


class ContentHandler(xml.sax.handler.ContentHandler):
    """SAX handler storing categories and their pages in the database.

    Element handling is table-driven: ``self._handlers`` maps a
    (namespace, localname) pair to a (start, end) pair of bound methods.
    ``self._context`` is the stack of currently open element names.
    """

    def __init__(self, db):
        self._db = db
        # priorities seen for categories that are not in the database yet,
        # keyed by topic path tuple
        self._category_priority = {}
        self._context = []

        self._catid, self._topic, self._url = None, None, None
        self._title, self._description = None, None
        self._priority, self._mediadate, self._type = '', '', ''
        self._links = {}
        self._pages = None

        self._lastupdate, self._editors, self._newsgroups = None, None, None
        self._related, self._symbolic = None, None

        link = (self.link_start, self.link_end)
        self._handlers = {
            (_NS_DMOZ, 'Topic'): (self.topic_start, self.topic_end),
            (_NS_DMOZ, 'catid'): (self.catid_start, self.catid_end),
            (_NS_DMOZ, 'editor'): (self.editor_start, self.editor_end),
            (_NS_DMOZ, 'newsGroup'):
                (self.newsgroup_start, self.newsgroup_end),
            (_NS_DMOZ, 'link'): link,
            (_NS_DMOZ, 'link1'): link,
            (_NS_DMOZ, 'link2'): link,
            (_NS_DMOZ, 'atom'): link,
            (_NS_DMOZ, 'atom1'): link,
            (_NS_DMOZ, 'atom2'): link,
            (_NS_DMOZ, 'rss'): link,
            (_NS_DMOZ, 'rss1'): link,
            (_NS_DMOZ, 'rss2'): link,
            (_NS_DMOZ, 'pdf'): link,
            (_NS_DMOZ, 'pdf1'): link,
            (_NS_DMOZ, 'pdf2'): link,
            (_NS_DMOZ, 'related'): (self.related_start, self.related_end),
            (_NS_DMOZ, 'symbolic'):
                (self.symbolic_start, self.symbolic_end),
            (_NS_DMOZ, 'symbolic1'):
                (self.symbolic1_start, self.symbolic_end),
            (_NS_DMOZ, 'symbolic2'):
                (self.symbolic2_start, self.symbolic_end),
            (_NS_DMOZ, 'narrow'): (self.narrow_start, self.narrow_end),
            (_NS_DMOZ, 'narrow1'): (self.narrow1_start, self.narrow_end),
            (_NS_DMOZ, 'narrow2'): (self.narrow2_start, self.narrow_end),
            (_NS_DMOZ, 'lastUpdate'):
                (self.lastupdate_start, self.lastupdate_end),
            (_NS_DMOZ, 'ExternalPage'):
                (self.externalpage_start, self.externalpage_end),
            (_NS_DMOZ, 'priority'):
                (self.priority_start, self.priority_end),
            (_NS_DMOZ, 'type'): (self.type_start, self.type_end),
            (_NS_DMOZ, 'mediadate'):
                (self.mediadate_start, self.mediadate_end),
            (_NS_DC, 'Title'): (self.title_start, self.title_end),
            (_NS_DC, 'Description'):
                (self.description_start, self.description_end),
        }

    def _flush(self):
        """Write the pages collected for the current category, if any."""
        global stat_links

        if self._pages and self._catid:
            cursor = self._db.cursor()
            cursor.execute('BEGIN TRANSACTION')

            if False:
                # disabled alternative: one row per page in table 'page'
                for priority, mediadate, skey, url, title, descr, ftype \
                        in self._pages:
                    if not mediadate:
                        mediadate = None
                    if not ftype:
                        ftype = None

                    cursor.execute(
                        'INSERT INTO page (cid, priority, mediadate, type, url, title, descr) VALUES (?, ?, ?, ?, ?, ?, ?)',
                        (self._catid, priority, mediadate, ftype,
                         url, title, descr))
            else:
                self._pages.sort(pagesorter)
                data = []
                for priority, mediadate, skey, url, title, descr, ftype \
                        in self._pages:
                    if not mediadate:
                        mediadate = ''
                    if not ftype:
                        ftype = 0

                    # record: flag byte (priority and type) followed by the
                    # mediadate, then url, title and description, separated
                    # by NUL bytes
                    data.append('\0'.join(
                        (chr(0x40 + priority + (ftype << 3))
                         + mediadate.encode('ascii'),
                         url.encode('utf8'),
                         title.encode('utf8'),
                         descr.encode('utf8'))))

                cdata = zlib.compress('\n'.join(data))
                cursor.execute(
                    'INSERT OR REPLACE INTO pagelist (cid, data) VALUES (?, ?)',
                    (self._catid, buffer(cdata)))

            cursor.execute('COMMIT')
            stat_links += len(self._pages)

        # NOTE(review): the original indentation was lost; resetting the
        # page list unconditionally is assumed here -- confirm.
        self._pages = None

    def _update_category_priority(self, topic, priority):
        """Store ``priority`` for the category ``topic``.

        If the category is not in the database yet, the priority is
        remembered and applied by ``topic_end`` when the category is
        inserted.
        """
        cursor = self._db.cursor()
        catid = get_catid(cursor, topic)
        if catid == -1:
            self._category_priority[tuple(topic)] = priority
        else:
            cursor.execute('UPDATE category SET priority=? WHERE id=?',
                           (priority, catid))

    def startDocument(self):
        pass

    def endDocument(self):
        self._flush()

    def processingInstruction(self, target, data):
        pass

    def startElement(self, name, attrs):
        # a leftover debug print of every element name was removed here
        self.startElementNS(name, name, attrs)

    def startElementNS(self, name, qname, attrs):
        """Dispatch to the start handler for ``name`` and push the name."""
        # Look the handler up separately from calling it, so a KeyError
        # raised inside a handler is not mistaken for "no handler".
        handler = self._handlers.get(name)
        if handler is not None:
            handler[0](attrs)

        self._context.append(name)

    def endElement(self, name):
        self.endElementNS(name, name)

    def endElementNS(self, name, qname):
        """Pop the element stack and dispatch to the end handler."""
        del self._context[-1]

        handler = self._handlers.get(name)
        if handler is not None:
            handler[1]()

    def characters(self, data):
        """Append character data to the field of the innermost element."""
        if not self._context:
            return

        context = self._context
        current = context[-1]

        if current == (_NS_DMOZ, 'catid'):
            self._catid += data
        elif len(context) >= 2 and context[-2] == (_NS_DMOZ, 'ExternalPage'):
            if current == (_NS_DC, 'Title'):
                self._title += data
            elif current == (_NS_DC, 'Description'):
                self._description += data
            elif current == (_NS_DMOZ, 'priority'):
                self._priority += data
            elif current == (_NS_DMOZ, 'mediadate'):
                self._mediadate += data
            elif current == (_NS_DMOZ, 'type'):
                self._type += data
        elif len(context) >= 2 and context[-2] == (_NS_DMOZ, 'Topic'):
            if current == (_NS_DC, 'Description'):
                self._description += data
            elif current == (_NS_DMOZ, 'lastUpdate'):
                self._lastupdate += data

    def skippedEntity(self, name):
        print('skippedEntity %s' % (name,))

    def topic_start(self, attrs):
        """Flush the previous category's pages and reset per-topic state."""
        self._flush()
        self._links = {}
        self._catid = None
        self._description = ''
        self._lastupdate, self._editors, self._newsgroups = '', [], []
        self._related, self._symbolic = [], []

        try:
            topic_id = attrs[(_NS_RDF, 'id')]
        except KeyError:
            self._topic = None
            return

        self._topic = topic_id.replace('_', ' ').split('/')
        if CONF_verbose and len(self._topic) <= 4:
            print('processing %s'
                  % ('/'.join(self._topic).encode('iso8859-1', 'replace'),))
            # NOTE(review): nesting reconstructed -- the rate is assumed to
            # be reported only together with a progress line; confirm.
            if CONF_verbose >= 2:
                print('%.1f categories per second'
                      % ((stat_newcats + stat_updatecats)
                         / (time.time() - time_start),))

    def topic_end(self):
        """Insert or update the category described by the closed Topic."""
        global catid_counter
        global stat_newcats, stat_updatecats

        catid_counter += 1
        self._pages = []

        # support for dump without catid's
        if not self._catid:
            if self._topic == ['Top']:
                self._catid = 1
            else:
                self._catid = catid_counter

        topic = self._topic
        if topic is None:
            # Topic without an id attribute: there is no path to store it
            # under (previously this raised TypeError on len(None)).
            self._catid = None
        else:
            # skip categories that are neither inside CONF_subcat nor one
            # of its ancestors
            depth = len(CONF_subcat)
            if len(topic) >= depth:
                outside = topic[:depth] != CONF_subcat
            else:
                outside = CONF_subcat[:len(topic)] != topic
            if outside:
                self._catid = None

        if self._catid and self._topic:
            cursor = self._db.cursor()
            cursor.execute('BEGIN TRANSACTION')

            parent_id = get_catid(cursor, self._topic[:-1])
            row = None
            for row in cursor.execute(
                    'SELECT id FROM category WHERE pid=? AND name=?',
                    (parent_id, self._topic[-1])):
                # category already exists: keep its id
                self._catid = row[0]
                stat_updatecats += 1

            if row is None:
                stat_newcats += 1

            lastupdate = None
            if self._lastupdate:
                stamp = self._lastupdate
                try:
                    # fixed-position fields: year, month, day, hour,
                    # minute, second
                    lastupdate = time.mktime(
                        (int(stamp[0:4]), int(stamp[5:7]), int(stamp[8:10]),
                         int(stamp[11:13]), int(stamp[14:16]),
                         int(stamp[17:19]),
                         0, 0, 0)) - time.timezone
                except ValueError:
                    pass

            if self._description:
                cdescr = buffer(
                    zlib.compress(self._description.encode('utf8')))
            else:
                cdescr = None

            try:
                priority = self._category_priority[tuple(self._topic)]
            except KeyError:
                priority = 0

            if row is None:
                cursor.execute(
                    'INSERT INTO category (id, pid, name, priority, lastupdate, descr) VALUES (?, ?, ?, ?, ?, ?)',
                    (self._catid, parent_id, self._topic[-1],
                     priority, lastupdate, cdescr))
            else:
                # only overwrite the columns this dump provides a value for
                update_attrs, update_values = [], []

                if priority != 0:
                    update_attrs.append('priority')
                    update_values.append(priority)

                if lastupdate is not None:
                    update_attrs.append('lastupdate')
                    update_values.append(lastupdate)

                if cdescr is not None:
                    update_attrs.append('descr')
                    update_values.append(cdescr)

                if update_attrs:
                    assignments = [attr + '=?' for attr in update_attrs]
                    cursor.execute(
                        'UPDATE category SET %s WHERE id=?'
                        % (', '.join(assignments)),
                        tuple(update_values + [self._catid]))

            for editor in self._editors:
                cursor.execute(
                    'INSERT OR IGNORE INTO category_editor (cid, editor) VALUES (?, ?)',
                    (self._catid, editor))

            for newsgroup in self._newsgroups:
                cursor.execute(
                    'INSERT OR IGNORE INTO category_newsgroup (cid, newsgroup) VALUES (?, ?)',
                    (self._catid, newsgroup))

            for related in self._related:
                rid = get_catid(cursor, related.split('/'))
                if rid != -1:
                    cursor.execute(
                        'INSERT OR IGNORE INTO related (cid, rid) VALUES (?, ?)',
                        (self._catid, rid))

            for sym in self._symbolic:
                lid = get_catid(cursor, sym[2].split('/'))
                if lid != -1:
                    cursor.execute(
                        'INSERT OR IGNORE INTO link (cid, lid, priority, name) VALUES (?, ?, ?, ?)',
                        (self._catid, lid, sym[0], sym[1]))

            cursor.execute('COMMIT')

            try:
                del self._category_priority[tuple(self._topic)]
            except KeyError:
                pass

        self._description = None
        self._lastupdate = None
        self._editors = None
        self._newsgroups = None
        self._related = None
        self._symbolic = None

    def catid_start(self, attrs):
        self._catid = ''

    def catid_end(self):
        self._catid = int(self._catid)

    def link_start(self, attrs):
        """Remember the link target so the matching page is accepted."""
        try:
            self._links[attrs[_ATTR_RESOURCE]] = None
        except KeyError:
            pass

    def link_end(self):
        pass

    def lastupdate_start(self, attrs):
        pass

    def lastupdate_end(self):
        pass

    def priority_start(self, attrs):
        pass

    def priority_end(self):
        pass

    def mediadate_start(self, attrs):
        pass

    def mediadate_end(self):
        pass

    def type_start(self, attrs):
        pass

    def type_end(self):
        pass

    def editor_start(self, attrs):
        try:
            self._editors.append(attrs[_ATTR_RESOURCE])
        except KeyError:
            pass

    def editor_end(self):
        pass

    def newsgroup_start(self, attrs):
        try:
            group = attrs[_ATTR_RESOURCE]
        except KeyError:
            return

        if group[:5] == 'news:':
            group = group[5:]
        self._newsgroups.append(group)

    def newsgroup_end(self):
        pass

    def related_start(self, attrs):
        try:
            self._related.append(attrs[_ATTR_RESOURCE].replace('_', ' '))
        except KeyError:
            pass

    def related_end(self):
        pass

    def _add_symbolic(self, attrs, priority):
        """Record a symbolic link 'name:path' with the given priority."""
        try:
            resource = attrs[_ATTR_RESOURCE]
        except KeyError:
            return

        # raises ValueError when the resource contains no ':'
        name, res = resource.replace('_', ' ').split(':', 1)
        self._symbolic.append((priority, name, res))

    def symbolic_start(self, attrs):
        self._add_symbolic(attrs, 0)

    def symbolic1_start(self, attrs):
        self._add_symbolic(attrs, 1)

    def symbolic2_start(self, attrs):
        self._add_symbolic(attrs, 2)

    def symbolic_end(self):
        pass

    def _narrow(self, attrs, priority):
        """Assign ``priority`` to the sub-category named by the element."""
        try:
            res = attrs[_ATTR_RESOURCE].replace('_', ' ')
        except KeyError:
            return

        self._update_category_priority(res.split('/'), priority)

    def narrow_start(self, attrs):
        pass

    def narrow1_start(self, attrs):
        self._narrow(attrs, 1)

    def narrow2_start(self, attrs):
        self._narrow(attrs, 2)

    def narrow_end(self):
        pass

    def externalpage_start(self, attrs):
        self._title, self._description = '', ''
        self._priority, self._mediadate, self._type = '', '', ''

        try:
            self._url = attrs[(None, 'about')].encode('ascii', 'replace')
        except KeyError:
            self._url = None

    def externalpage_end(self):
        """Queue the finished page if the current category links to it."""
        if self._catid and self._url and self._title:
            self._title = self._title.replace('\n', ' ').replace('\r', ' ')
            self._description = \
                self._description.replace('\n', ' ').replace('\r', ' ')

            self._priority = self._priority.strip().lower()
            if self._priority:
                self._priority = int(self._priority)
            else:
                self._priority = 0

            self._mediadate = self._mediadate.strip().lower()
            if self._mediadate[-3:] == '.00':
                self._mediadate = self._mediadate[:-3]

            # numeric type codes; anything unrecognised is stored as 0
            self._type = {'pdf': 1, 'rss': 2, 'atom': 3}.get(
                self._type.strip().lower(), 0)

            if self._url in self._links:
                self._pages.append(
                    (self._priority, self._mediadate, simplify(self._title),
                     self._url, self._title, self._description, self._type))
                del self._links[self._url]
            elif CONF_verbose:
                print('unreferenced link %s (%s) in %s'
                      % (self._url,
                         self._title.encode('iso8859-1', 'replace'),
                         '/'.join(self._topic).encode('iso8859-1',
                                                      'replace')))

        self._url, self._title, self._description = None, None, None
        self._priority, self._mediadate, self._type = None, None, None

    def title_start(self, attrs):
        pass

    def title_end(self):
        pass

    def description_start(self, attrs):
        pass

    def description_end(self):
        pass


optlist, args = getopt.getopt(sys.argv[1:], 's:v', ['subcat=', 'verbose'])

for opt, arg in optlist:
    if (opt == '-s') or (opt == '--subcat'):
        CONF_subcat = ['Top'] + arg.split('/')
    elif (opt == '-v') or (opt == '--verbose'):
        CONF_verbose += 1

db = apsw.Connection('odp.db')

# no need for synchronous updates when creating the database
db.cursor().execute('PRAGMA synchronous=OFF')

try:
    try:
        l = sys.stdin.readline()

        if l:
            content_parser = xml.sax.make_parser()
            content_parser.setFeature(
                xml.sax.handler.feature_namespaces, True)
            content_parser.setFeature(
                xml.sax.handler.feature_namespace_prefixes, True)

            content_handler = ContentHandler(db)
            content_parser.setContentHandler(content_handler)

            while l:
                # sanitise broken UTF-8 encoded input
                l = l.decode('utf-8', 'replace').encode('utf-8')
                content_parser.feed(l)
                l = sys.stdin.readline()

            content_parser.close()
    except KeyboardInterrupt:
        print('aborted')
finally:
    if CONF_verbose:
        print('created %d new categories, updated %d categories'
              % (stat_newcats, stat_updatecats))
        # NOTE(review): the original indentation was lost; the link total
        # is assumed to be verbose-only output -- confirm.
        print('processed %d links' % (stat_links))
        if CONF_verbose >= 2:
            print('%.1f categories per second'
                  % ((stat_newcats + stat_updatecats)
                     / (time.time() - time_start),))