# -*- coding: utf-8
# Copyright 2004-2006 by Vahur Rebas

import re
import logging
import Globals
from Globals import Acquisition, Persistent
from AccessControl import ClassSecurityInfo
from AccessControl import getSecurityManager
from Acquisition import aq_base, aq_inner, aq_parent, Implicit
from OFS.SimpleItem import SimpleItem
from OFS.PropertyManager import PropertyManager
from zope.interface import implements
import random
import xml.dom.minidom
# NOTE(review): the class below shadows this minidom Document import; the
# import appears unused (parsing goes through xml.dom.minidom.parseString).
from xml.dom.minidom import Document
from xml.dom import Node
#from xml.dom.ext import PrettyPrint
#from Products.ZCatalog.CatalogAwareness import CatalogAware
from Products.ZCatalog.CatalogPathAwareness import CatalogAware
from textindexng.interfaces import IIndexableContent
from textindexng.interfaces import IConverter, ISplitter
from textindexng.converters import html, sgml, ooffice, pdf
from textindexng.converters.entities import convert_entities
import types

from interfaces import IDocument
from permissions import *
from schemas import get_schema
from utils import invokeScript

logger = logging.getLogger('Korpus')


class Document(Persistent, SimpleItem, CatalogAware, PropertyManager):
    """ One document of the corpus: raw html plus extracted title/description/
    body, word statistics and error-marking state. """
    implements(IDocument, IIndexableContent)
    meta_type = 'Document'
    security = ClassSecurityInfo()
    #security.declareObjectProtected(perm_view,)
    security.declareObjectPublic()  # XXX: this doesn't work!
    manage_options = PropertyManager.manage_options + SimpleItem.manage_options
    #manage_options=(
    #    {'label':'View',
    #     'action':'document_view'},
    #    )+SimpleItem.manage_options

    def __init__(self, _id, _doc, _schema):
        """ Store the raw document and initialise all derived attributes. """
        self.id = _id
        self.fulldoc = _doc
        self.default_catalog = 'zcatalog'
        self.org_fulldoc = ''        # original full document, without marked text
        self.org_title = ''          # original title          getOrgTitle
        self.org_description = ''    # original description    getOrgDescription
        self.org_body = ''           # original body           getOrgBody
        self._setProperty('document_status', 0, 'int')  # getStatus
        self._setProperty('is_deleted', 0, 'boolean')
        self.title = ''              # getTitle
        self.description = ''        # getDescription
        self.body = ''               # getBody
        self._setProperty('textdoc', '', 'text')
        self._setProperty('words', 0, 'int')
        self._setProperty('sentences', 0, 'int')
        self._setProperty('lastModifier', '', 'string')
        # document schema
        self._document_schema = _schema
        self._corpus = ''
        self._dsstore = {}
        self.textcontent = ""
        self._tmp_doc = True

    def __getattr__(self, name):
        """ Resolve unknown attributes against the document's schema fields. """
        # look the schema up once instead of twice
        schema = get_schema(self.getMetaSchema())
        if name in schema.getFieldNames():
            return schema.getValue(name, self)
        # include the attribute name so failures are debuggable
        raise AttributeError(name)

    security.declarePrivate('manage_afterAdd')
    def manage_afterAdd(self, obj, container):
        """ Extract content from the raw document, index it and refresh the
        sql-side statistics after the object is placed in its container. """
        self.title = self.extractTitle()
        self.description = self.extractDescription()
        self.body = self.extractBody()
        self._updateProperty('textdoc', self.toText())
        #self.extractErrors()
        self.countWords()
        self.index_object()
        self._updateDocStatistics()
        self._storeDocInfos()
        #self.setGlobalUsedTongues(self.getTongue())
        #self.setGlobalUsedNations(self.getNation())

    security.declarePrivate('manage_beforeDelete')
    def manage_beforeDelete(self, item, container):
        """ unindex object before deletion """
        self.unindex_object()

    security.declareProtected(perm_view_document, 'getId')
    def getId(self):
        """ return id """
        return self.id

    security.declareProtected(perm_view_document, 'getTitle')
    def getTitle(self):
        """ return title """
        return self.title

    def getMetaSchema(self):
        """ return schema """
        return self._document_schema

    def getCorpus(self):
        """ Return the corpus id; '', 'main', None and a missing attribute all
        fall back to the default corpus id. """
        k = getattr(self, '_corpus', 'cFOoRQekA')
        if k in ('', 'main', None):
            return 'cFOoRQekA'
        return k
        #return getattr(self, '_corpus', '')
security.declareProtected(perm_manage, 'setCorpus') def setCorpus(self, cid): """ set a corpus id eg. assign a document to korpus """ self._corpus = cid security.declareProtected(perm_view_document, 'getDescription') def getDescription(self): """ return description """ return self.description security.declareProtected(perm_view_document, 'transformToView') def transformToView(self, REQUEST=None): """ return body """ body = self.getBody() if REQUEST: sw=REQUEST.get('searchword', None) if sw: p=re.compile("(\\b"+sw+"\\b)", re.I) body=p.sub(r"\1", body) # body=body.replace(' '+sw+' ', " "+sw+" ") # regex=re.compile(r"(* )"+sw+"([\\.,?!]*)", re.MULTILINE) # body=regex.sub("\\1"+sw+"\\2") return body def isDeleted(self): return self.getProperty('is_deleted') security.declareProtected(perm_del_documents, 'deleteMe') def deleteMe(self): self._updateProperty('is_deleted', 1) self.sqls.docDeleted(docid=self.getId()) security.declareProtected(perm_view_document, 'getBody') def getBody(self): """ return body """ return self.body security.declareProtected(perm_view_document, 'indexableContent') def indexableContent(self, fields): from textindexng.content import IndexContentCollector as ICC icc = ICC() doc = unicode(self.fulldoc, 'utf-8') icc.addBinary('getDocument', doc, 'text/html') #icc.addContent('getMarkedWords', unicode(self.getMarkedWords(), 'utf-8')) marked = self.getMarkedWords() if type('') == type(marked): marked = unicode(marked, 'utf-8') icc.addContent('getMarkedWords', marked) icc.addContent('getTitle', unicode(self.getTitle(), 'utf-8')) return icc security.declareProtected(perm_view_document, 'getDocument') def getDocument(self): """ doc """ self.REQUEST.RESPONSE.setHeader("Content-type","text/html; charset=UTF-8") return self.fulldoc def getVisibleText(self): return self.textcontent security.declareProtected(perm_view_document, 'getStatus') def getStatus(self): """ return document status 0 - document is in editing mode 1 - document is in marking mode """ return 
self.getProperty('document_status') security.declareProtected(perm_view_document, 'getSubmitter') def getSubmitter(self): """ get the username who created document """ return self.getOwnerTuple()[1] security.declareProtected(perm_view_document, 'getLastModder') def getLastModder(self): """ last modifier of the document """ try: return self.lastModifier except AttributeError: return -1 security.declareProtected(perm_change_status, 'start_modding') def start_modding(self, REQUEST): """ start modding document we should copy fulldoc, body, title, desc etc to orginial document holding attributes """ if not self.getStatus(): self._updateProperty('document_status', 1) self.sqls.docStatus(docid=self.getId(), status=1) import copy self.org_fulldoc = copy.deepcopy(self.fulldoc) self.org_title = copy.deepcopy(self.title) self.org_description = copy.deepcopy(self.description) self.org_body = copy.deepcopy(self.body) self.reindex_object() return REQUEST.RESPONSE.redirect('mark.html') security.declareProtected(perm_edit_document, 'extractTitle') def extractTitle(self): """ exract title from Document """ doc = xml.dom.minidom.parseString(self.getDocument()) title = doc.getElementsByTagName('title')[0] return get_text(title) security.declareProtected(perm_edit_document, 'extractDescription') def extractDescription(self): """ extract description from Document """ doc = xml.dom.minidom.parseString(self.getDocument()) desc = doc.getElementsByTagName('meta') for x in desc: if x.getAttribute('name') == 'description': return x.getAttribute('content') return "" security.declareProtected(perm_edit_document, 'extractBody') def extractBody(self): """ extract head from document """ doc = xml.dom.minidom.parseString(self.getDocument()) bodytag = doc.getElementsByTagName('body')[0] stri = bodytag.toxml() #print stri b = re.compile('()|()', re.I) stri = stri.encode('utf-8') res = b.sub('', stri) print print "RES:" print res res = unicode(res, 'utf-8').encode('utf-8') return res 
security.declareProtected(perm_edit_document, 'saveDocument') def saveDocument(self,REQUEST): """ save modified document """ self.fulldoc = REQUEST.kupu self.manage_afterAdd(self,self.aq_parent) self.sqls.morfoDelete(docid=self.getId()) self._tmp_doc = False try: self._updateProperty('lastModifier', str(REQUEST.AUTHENTICATED_USER)) except: self._setProperty('lastModifier', str(REQUEST.AUTHENTICATED_USER), 'string') return REQUEST.RESPONSE.redirect(self.absolute_url()) def saveInformat(self, REQUEST): """ save metadata """ get_schema(self.getMetaSchema()).update(self, REQUEST) self._p_changed = True self.reindex_object() return security.declareProtected(perm_view_document, 'getMarks') def getMarks(self, REQUEST): """ return marks. for marking page """ return self.Errors.getDocumentMarksHTML(self.getId(), str(REQUEST.AUTHENTICATED_USER)) security.declareProtected('do_not_touch_me', 'saveMarksNG_rescue') def saveMarksNG_rescue(self, REQUEST): """ saving mark pointers """ count = 0 from xml import xpath # determine how many pointer we have coun = 1000 for x in range(1,coun): pair = REQUEST.get('range_'+str(x), None) if not pair: continue pair = eval(pair) print "-->", x, pair r = pair[0] pointer = pair[0] try: code = pair[1] except IndexError: print "index error 1", pair continue existing = None try: existing = pair[2] except IndexError: pass if existing: if existing == 'DELETEME': continue if 'DELETEME' in pair: self.Errors.deleteError(existing) print "existing...", pair continue r_st, r_en = r.split(';') doc = xml.dom.minidom.parseString(self.getDocument()) start, st_offset = r_st.split('#') st_offset = st_offset.split(':')[1] end, en_offset = r_en.split('#') en_offset = en_offset.split(':')[1] body = doc.getElementsByTagName('body')[0] start = start.lower() end = end.lower() if start.startswith('/'): start = start[1:] if end.startswith('/'): end = end[1:] print "....", x, start start_node = xpath.Evaluate(start, body)[0] # always take the end_node = 
xpath.Evaluate(end, body)[0] # first node we get if start_node.isSameNode(end_node): content = start_node.nodeValue[int(st_offset):int(en_offset)] else: content = self.extractMarkedContent(start_node, end_node, int(st_offset), int(en_offset)) author = REQUEST.get('username') if type(content) == types.UnicodeType: content = content.encode('utf-8') err = self.Errors.addNewError(pointer, content, self.getId(), code, author) pre, post = self.extractContext(body, start_node, end_node, int(st_offset), int(en_offset), start=1) pre.reverse() err.addPreContext(pre) err.addPostContext(post) self.reindex_object() return REQUEST.RESPONSE.redirect(self.absolute_url()) security.declareProtected(perm_mark_document, 'saveMarksNG') def saveMarksNG(self, REQUEST): """ saving mark pointers """ #logger.info("Harry katsetus") #logger.info(REQUEST) count = 0 from xml import xpath # determine how many pointer we have coun = 1000 for x in range(1,coun): pair = REQUEST.get('range_'+str(x), None) if not pair: continue r = pair[0] pointer = pair[0] try: code = pair[1] except IndexError: continue existing = None try: existing = pair[2] except IndexError: pass if existing: if existing == 'DELETEME': continue if 'DELETEME' in pair: self.Errors.deleteError(existing) continue r_st, r_en = r.split(';') doc = xml.dom.minidom.parseString(self.getDocument()) start, st_offset = r_st.split('#') st_offset = st_offset.split(':')[1] end, en_offset = r_en.split('#') en_offset = en_offset.split(':')[1] body = doc.getElementsByTagName('body')[0] start = start.lower() end = end.lower() if start.startswith('/'): start = start[1:] if end.startswith('/'): end = end[1:] try: start_node = xpath.Evaluate(start, body)[0] # always take the except IndexError, ie: # # this is causing bug! 
# start == span/p[8]/span/text() # should be p[8]/span/text() raise ie end_node = xpath.Evaluate(end, body)[0] # first node we get if start_node.isSameNode(end_node): content = start_node.nodeValue[int(st_offset):int(en_offset)] else: content = self.extractMarkedContent(start_node, end_node, int(st_offset), int(en_offset)) author = str(REQUEST.AUTHENTICATED_USER) if type(content) == types.UnicodeType: content = content.encode('utf-8') #logger.info("Harry katsetused-8") #logger.info(body) #logger.info(start_node) #logger.info(end_node) #logger.info(int(st_offset)) #logger.info(int(en_offset)) #logger.info(start) pre, post = self.extractContext(body, start_node, end_node, int(st_offset), int(en_offset), start=1) pre.reverse() err = self.Errors.addNewError(pointer, content, self.getId(), code, author, pre, post) self.reindex_object() return REQUEST.RESPONSE.redirect(self.absolute_url()) security.declareProtected(perm_edit_document, 'extractContext') def extractContext(self, body, start_node, end_node, start_offset, end_offset, start = 0, res = None, seen_start=0, seen_end=0, pre='', post='', node=None): """ returning n contexts of different sizes ['sentences', 'from', 'closes', 'to', 'five'] #precontent ['sentences', 'from', 'closes', 'to', 'five'] #postcontent """ if start: res = [] pre = post = '' seen_start = seen_end = 0 node = body # do stuff for x in node.childNodes: done_this = 0 if x.isSameNode(start_node): seen_start = 1 if x.isSameNode(end_node): seen_end = 1 if x.isSameNode(start_node): pre += x.nodeValue[:start_offset] done_this = 1 if x.isSameNode(end_node): post += x.nodeValue[end_offset:] done_this = 1 if not seen_start and not seen_end and not done_this: if x.nodeValue is None: pre += ' ' else: pre += x.nodeValue if not done_this and seen_start and not seen_end: # huh? it doesn't interest us? 
pass if seen_end and not done_this: if x.nodeValue is None: post += ' ' else: post += x.nodeValue if x.hasChildNodes(): seen_start, seen_end, pre, post = self.extractContext(body, start_node, end_node, start_offset, end_offset, 0, res, seen_start, seen_end, pre, post, x) if start: # do postprocessing i = len(pre) buff = '' pre_sent = [] while i>0: i += -1 if pre[i] == '.' or pre[i] == '!' or pre[i] == '?': pre_sent.append(buff) buff = '' buff = pre[i] + buff pre_sent = pre_sent[:5] i = 0 buff = '' post_sent = [] while i', '\n', htm) txt = txt.replace('\r', '') txt = re.sub('<.*?>', '', txt) #txt = txt.replace('\n', ' ') txt = re.sub('(?[0-9]+\.)([ \n]+)(?P(jaan|veeb|mär|apr|mai|juuni|juuli|august|septem|oktoob|nov|det|saj|eluaast))', re.I|re.S) res, subcount = p.subn(r'\g \g', intxt) if subcount > 0: print "date fix for %s" % self.getId(), subcount return res def _fix_names(self, res): """ fixes A. Kivikas """ pats = [re.compile('(?P( |\n))(?P.\.)\n+', re.I|re.S), re.compile('(?P( |\n))(?P.\.) \n+', re.I|re.S)] for p in pats: for x in range(2): subcount = -1 res, subcount = p.subn(r'\g\g ', res) if subcount > 0: print "name fix r%s for %s" % ( x, self.getId()), subcount else: print "." return res def _fix_shors(self, res): """ shorts like dr. """ pats = [re.compile(' (?P(dr|pr|mr)\.)\n', re.I|re.S)] for p in pats: res, subcount = p.subn(r' \g ', res) return res def _fix_dots(self, res): """ ... """ for x in range(1,20): res = res.replace('.\n.', '..') res = res.replace('...\n', '... ') res = res.replace('... \n', '... 
') res = re.sub('(?P\(.*?)\n(?P.*?\))', r'\g\g', res) return res def _fix_years(self, intxt): p1 = re.compile('(?P[0-9]+\.)([\n ])+(?P(aasta|a\.))', re.I|re.S) p2 = re.compile('(?P[0-9]+\.)[ \n]+(?P-[0-9]+)', re.I|re.S) res, subcount = p1.subn(r'\g \g', intxt) if subcount > 0: print "year fix for %s" % self.getId(), subcount res, subcount = p2.subn(r'\g\g', res) if subcount > 0: print "year range fix for %s" % self.getId(), subcount return res security.declareProtected(perm_view_document, 'get_n_of_errors') def get_n_of_errors(self): codes = self.Errors.getDocumentMarks(self.getId()) return len(codes) security.declareProtected(perm_view_document, 'get_n_of_diff_errors') def get_n_of_diff_errors(self): res = [] for x in self.Errors.getDocumentMarks(self.getId()): if x.getProperty('code') not in res: res.append(x.getProperty('code')) return len(res) security.declareProtected(perm_view_document, 'getMarkedWords') def getMarkedWords(self): codes = self.Errors.getDocumentMarks(self.getId()) res = [] for x in codes: res.append(x.getProperty('content')) return ','.join(res) security.declareProtected(perm_view_document, 'getUsedCodes') def getUsedCodes(self): """ for indexing """ codes = self.Errors.getDocumentMarks(self.getId()) res = [] for x in codes: res.append(x.getProperty('code')) return res security.declareProtected(perm_view_document, 'prettyUsedCodes') def prettyUsedCodes(self, REQUEST): codes = self.Errors.getDocumentMarks(self.getId(), str(REQUEST.AUTHENTICATED_USER)) res = [] for x in codes: y = x.getProperty('code').encode('utf-8') xpinter = x.getProperty('pointer').encode('utf-8') res.append([y, xpinter, self.Marks.prettyCodeTitle(y)]) return res security.declareProtected(perm_view_document, 'getInheritedCodes') def getInheritedCodes(self, REQUEST): """ return inherited codes. 
uniqs only """ codes = self.Errors.getDocumentMarks(self.getId(), str(REQUEST.AUTHENTICATED_USER)) res = [] done = [] for x in codes: y = x.getProperty('code') code = self.Marks.getCode(y) #print ">>",code, code.aq_parent, code.aq_parent.meta_type try: tmp = code.aq_parent except AttributeError: continue while tmp.meta_type == 'mark': if tmp not in done: done.append(tmp) res.append(tmp.getTitle()) tmp = tmp.aq_parent return res security.declareProtected(perm_view_document, 'get_n_of_words') def get_n_of_words(self): """ words in document """ return self.getProperty('words') security.declareProtected(perm_view_document, 'get_n_of_sentences') def get_n_of_sentences(self): """ number of sentences in document """ return self.getProperty('sentences') security.declareProtected(perm_view_document, 'prettyDate') def prettyDate(self): """ pretty bobobase_mod.. time """ time = self.bobobase_modification_time().strftime('%d-%m-%Y %H:%M') return time security.declareProtected(perm_view_document, 'rawDate') def rawDate(self): """ raw date for indexing """ return self.bobobase_modification_time().ISO() security.declareProtected(perm_edit_document, 'countWords') def countWords(self): """ count words and sentences """ sentences, words = 0, 0 doc = self.getProperty('textdoc') sentences += doc.count('.') + doc.count('!') + doc.count('?') tempwords = doc.split() words += len(tempwords) self._updateProperty('words', words) self._updateProperty('sentences', sentences) return security.declareProtected(perm_manage, 'convertErrorsNG') def convertErrorsNG(self, start=1, doc=None, node=None, level=0, xnodes = []): """ convert errors """ if start: doc = xml.dom.minidom.parseString(self.getDocument()) node = doc.firstChild node = node.nextSibling #PrettyPrint(doc) xnodes = [] childs = node.childNodes dont_decr = 0 c = 0 c_tot = len(childs) while c" #if x.parentNode.nodeName=='error': # print level*' ', x.nodeValue #elif x.nodeType == Node.TEXT_NODE: # nodeval = x.nodeValue # print level*' ', 
nodeval.encode('utf-8') if x.nodeName.lower() == 'error': #print "---------------------------------------- removing" ts = node.childNodes i = 0 back = 0 look_node = 0 while i" if start: #PrettyPrint(doc) unis = {} for x in xnodes: if not unis.has_key(x[3]): unis[x[3]] = {} unis[x[3]][x[2]] = x[1] unis[x[3]]['xp'] = x[0] unis[x[3]]['code_id'] = x[4] if x[2] == u'start': unis[x[3]]['xp_start'] = x[0] else: unis[x[3]]['xp_end'] = x[0] from xml import xpath body = doc.getElementsByTagName('body')[0] add_counter = 0 for x in unis.keys(): if not unis[x].has_key('start'): continue if not unis[x].has_key('end'): continue # get starting node xp = unis[x]['xp_start'] if xp.startswith('/'): xp = xp[1:] start_node = xpath.Evaluate(xp, body)[0] # get ending node xp = unis[x]['xp_end'] if xp.startswith('/'): xp = xp[1:] end_node = xpath.Evaluate(xp, body)[0] if start_node.isSameNode(end_node): content = start_node.nodeValue[unis[x][u'start']:unis[x][u'end']] else: content = self.extractMarkedContent(start_node, end_node, unis[x][u'start'], unis[x][u'end']) pointer = unis[x]['xp_start']+'#off:'+str(unis[x][u'start']) pointer += ';' pointer += unis[x]['xp_end']+'#off:'+str(unis[x][u'end']) tmp = unis[x]['code_id'] try: code = tmp.split('-')[1] except IndexError: code = tmp author = code.split('_')[0] add_counter += 1 content = content.encode('utf-8') err = self.Errors.addNewError(pointer, content, self.getId(), code, author) pre, post = self.extractContext(body, start_node, end_node, unis[x][u'start'], unis[x][u'end'], start=1) pre.reverse() err.addPreContext(pre) err.addPostContext(post) print "--------- TOTAL ADD:", add_counter bodytag = doc.getElementsByTagName('body')[0] stri = bodytag.toxml() b = re.compile('()|()', re.I) stri = stri.encode('utf-8') res = b.sub('', stri) res = unicode(res, 'utf-8').encode('utf-8') fd = doc.toxml().encode('utf-8') fd = re.sub('<\?xml version="1.0" \?>', '', fd) self.fulldoc = fd self.body = res return "ok" 
security.declareProtected(perm_manage, 'mig_walker') def mig_walker(self, node, depth=0): """ migration walker. generated xpath from error nodes """ res = "" if node.parentNode and depth < 99 and node.parentNode.nodeName.lower() != 'body': res += self.mig_walker(node.parentNode, depth + 1) index = self.mig_siblingIndex(node) if node.nodeType == Node.ELEMENT_NODE: res += '/'+node.nodeName+'['+str(index)+']' #print "element node" elif node.nodeType == Node.DOCUMENT_NODE: print "document node" raise 'document node' elif node.nodeType == Node.TEXT_NODE: res += '/text()' if index > 1: res += '['+str(index)+']' return res security.declareProtected(perm_manage, 'mig_siblingIndex') def mig_siblingIndex(self, node): """ return sibling index """ siblings = node.parentNode.childNodes count = 0 res = 0 if node.nodeType == Node.ELEMENT_NODE: name = node.nodeName for x in siblings: if x.nodeType == Node.ELEMENT_NODE: if x.nodeName == name: count += 1 if x == node: res = count break elif node.nodeType == Node.TEXT_NODE: for x in siblings: if x.nodeType == Node.TEXT_NODE: count += 1 if x == node: res = count break return res security.declareProtected(perm_edit_document, 'get_uniq_words') def get_uniq_words(self): """ count words and sentences TODO: 11)Antut -> 11Antut, wrong! 
""" doc = self.textdoc3 tempwords = doc.split() res = {} printed = [] for wp in tempwords: org_org_word = wp split_sym = [')', '(', '/'] word = [wp] for sym in split_sym: tmp = [] for x in word: tmp += x.split(sym) word = tmp for w in word: if not w: continue org_w = w for i in ['.', ',', '"', '(', ')', '`', '?', '!', '“', '*', ';', ':','”', '→', '•', ' ·', '„', ']', '[']: w = w.replace(i, '') w = w.replace('`', '\'') w = w.lower() #if len(w) == 0: # if self.getId() not in printed: # print self.getId(), w, org_w # printed.append(self.getId()) for rem in ["´", "'", '-', '/']: if w.startswith(rem): w = w[1:] if w.endswith(rem): w = w[:-1] for rep in [['Õ', 'õ'], ['Ü', 'ü'], ['Ö', 'ö'], ['Ä', 'ä']]: w = w.replace(rep[0], rep[1]) if res.has_key(w): res[w] = res[w] + 1 else: res[w] = 1 if w == 'v': print 'V::', self.getId(), org_org_word if w.startswith('haal'): print "HAAL::", self.getId(), org_org_word return res def another_txt(self): import re p_start = re.compile('|', re.I) p_end = re.compile('

|', re.I) all = re.compile('<.*?>', re.I) txt = unicode(self.getBody(), 'utf-8') txt = convert_entities(txt).encode('utf-8') txt2 = re.sub(p_start, ' \n', txt) txt = re.sub(p_end, ' \n', txt2) value = re.sub(all, '',txt) self.textdoc3 = value #if self.getId() == 'doc_491521501739_item': # print "="*40 # print txt # print "="*40 # print value def getMorfoView(self): x = self.sqls.morfoRaw(docid=self.getId()) if len(x) == 0: self.sqls.morfoInsert(docid=self.getId(), morfotext=self.Search.morfoVastus(tekst=self.toText().encode('utf-8'))) x=self.sqls.morfoRaw(docid=self.getId()) #return "Not available" return x[0][0] def _updateDocStatistics(self): """ doctest """ from zope.component import getUtility from textindexng.interfaces import IConverter, ISplitter c = getUtility(IConverter, 'text/html') su = getUtility(ISplitter, name="txng.splitters.simple") doc = unicode(self.fulldoc, 'utf-8') cvtr, encoding = c.convert(doc) cvt = unicode(cvtr, encoding) spl = su.split(cvt) self.sqls.deleteDocref(docid=self.getId()) for s in spl: if not isinstance(s, unicode): raise 'not a unicode!' self.sqls.docsInsert(word=s, docid=self.getId(), language=getattr(self, 'docLanguage'), corpus=self.getCorpus()) return def _storeDocInfos(self): self.sqls.storeDocument( docid=self.getId(), title=self.getTitle(), language=getattr(self, 'docLanguage', 'x'), corpus=self.getCorpus()) # self.sqls.storeDocContent( # docid = self.getId(), # html = self.getBody(), # plaintext = self.toText(self.getBody())) self.sqls.storeDocContent( docid = self.getId(), html = "", plaintext = self.toText(self.getBody())) # ss = self.getSentences() # self.sqls.deleteSentencesByDocument(docid=self.getId()) # for s in ss: # self.sqls.addSentence(docid = self.getId(), lang=getattr(self, 'docLanguage', 'x'), sentence=s) def get_text(elem): """ used only to extract documents title... """ res=u'' for line in elem.childNodes: res += line.nodeValue return res.encode('utf-8') Globals.InitializeClass(Document)