# -*- coding: utf-8 -*-
# Copyright 2004-2006 by Vahur Rebas

import re
import logging
import Globals
from Globals import Acquisition, Persistent
from AccessControl import ClassSecurityInfo
from AccessControl import getSecurityManager
from Acquisition import aq_base, aq_inner, aq_parent, Implicit
from OFS.SimpleItem import SimpleItem
from OFS.PropertyManager import PropertyManager
from zope.interface import implements
import random
import xml.dom.minidom
from xml.dom.minidom import Document
from xml.dom import Node
#from xml.dom.ext import PrettyPrint
#from Products.ZCatalog.CatalogAwareness import CatalogAware
from Products.ZCatalog.CatalogPathAwareness import CatalogAware
from textindexng.interfaces import IIndexableContent
from textindexng.interfaces import IConverter, ISplitter
from textindexng.converters import html, sgml, ooffice, pdf
from textindexng.converters.entities import convert_entities
import types

from interfaces import IDocument
from permissions import *
from schemas import get_schema
from utils import invokeScript

logger = logging.getLogger('Korpus')


class Document(Persistent,SimpleItem,CatalogAware, PropertyManager):
    """ One document

    Holds the submitted (X)HTML in ``fulldoc`` together with the title,
    description and body extracted from it, and stores the error marks
    users attach to ranges of the text.

    NOTE(review): this class statement rebinds the module-level name
    ``Document`` imported from xml.dom.minidom above.
    """

    implements(IDocument, IIndexableContent)
    meta_type = 'Document'
    security = ClassSecurityInfo()
    #security.declareObjectProtected(perm_view,)
    security.declareObjectPublic() #XXX: this doesn't work!

    manage_options = PropertyManager.manage_options+SimpleItem.manage_options
    #manage_options=(
    #    {'label':'View',
    #    'action':'document_view'},
    #    )+SimpleItem.manage_options

    def __init__(self, _id, _doc, _schema):
        """ Create a document.

        _id     -- object id
        _doc    -- full document source; indexableContent() decodes it as
                   UTF-8 and the extract*() methods parse it as XML
        _schema -- metadata schema identifier, later handed to get_schema()
        """
        self.id = _id
        self.fulldoc = _doc
        self.default_catalog = 'zcatalog'

        self.org_fulldoc = ''       # original full document. without marked text
        self.org_title = ''         # original title          getOrgTitle
        self.org_description = ''   # original description    getOrgDescription
        self.org_body = ''          # original body           getOrgBody

        self._setProperty('document_status', 0, 'int')  # getStatus
        self._setProperty('is_deleted', 0, 'boolean')

        self.title = ''         # getTitle
        self.description = ''   # getDescription
        self.body = ''          # getBody.
        self._setProperty('textdoc', '', 'text')
        self._setProperty('words', 0, 'int')
        self._setProperty('sentences', 0, 'int')
        self._setProperty('lastModifier', '', 'string')

        # document schema
        self._document_schema = _schema
        self._corpus = ''

        self._dsstore = {}

        self.textcontent = ""
        self._tmp_doc = True

    def __getattr__(self, name):
        """ Resolve attributes that are fields of the document's metadata
        schema; anything else raises AttributeError(name). """
        # Private names are never schema fields.  Refusing them up front also
        # stops unbounded recursion: getMetaSchema() reads
        # self._document_schema, which would land here again if that
        # attribute were missing.
        if name.startswith('_'):
            raise AttributeError(name)
        schema = get_schema(self.getMetaSchema())
        if name in schema.getFieldNames():
            return schema.getValue(name, self)
        raise AttributeError(name)

    security.declarePrivate('manage_afterAdd')
    def manage_afterAdd(self,obj,container):
        """ Re-derive title, description, body and the plain-text copy from
        the stored document, then update counters, catalog and statistics.

        saveDocument() also calls this after the source has been replaced.
        """
        logger.debug("manage_afterAdd")
        self.title = self.extractTitle()
        self.description = self.extractDescription()
        self.body = self.extractBody()
        self._updateProperty('textdoc', self.toText())
        #self.extractErrors()
        self.countWords()
        self.index_object()
        self._updateDocStatistics()
        self._storeDocInfos()
        #self.setGlobalUsedTongues(self.getTongue())
        #self.setGlobalUsedNations(self.getNation())

    security.declarePrivate('manage_beforeDelete')
    def manage_beforeDelete(self, item, container):
        """ unindex object before deletion """
        self.unindex_object()

    security.declareProtected(perm_view_document, 'getId')
    def getId(self):
        """ return id """
        return self.id

    security.declareProtected(perm_view_document, 'getTitle')
    def getTitle(self):
        """ return title """
        return self.title

    def getMetaSchema(self):
        """ return schema """
        return self._document_schema

    def getCorpus(self):
        """ Return the corpus id this document is assigned to.

        A missing, empty, None or 'main' value is reported as the default
        corpus id.
        """
        default_corpus = 'cFOoRQekA'
        k = getattr(self, '_corpus', default_corpus)
        if k in ('', 'main', None):
            return default_corpus
        return k
        #return getattr(self, '_corpus', '')

    security.declareProtected(perm_manage, 'setCorpus')
    def setCorpus(self, cid):
        """ set a corpus id eg. assign a document to korpus """
        self._corpus = cid

    security.declareProtected(perm_view_document, 'getDescription')
    def getDescription(self):
        """ return description """
        return self.description

    security.declareProtected(perm_view_document, 'transformToView')
    def transformToView(self, REQUEST=None):
        """ Return the body for display.

        When REQUEST carries a non-empty 'searchword', every whole-word,
        case-insensitive occurrence of it is passed through the
        substitution below.
        """
        body = self.getBody()
        if REQUEST:
            sw = REQUEST.get('searchword', None)
            if sw:
                # The search word is user input: escape it so characters such
                # as '(' or '+' neither break the pattern nor act as regex
                # syntax.
                p = re.compile("(\\b" + re.escape(sw) + "\\b)", re.I)
                # NOTE(review): the replacement is just the match itself, so
                # this is currently a no-op; it looks as if highlight markup
                # around \1 was lost -- confirm against the deployed code.
                body = p.sub(r"\1", body)
        return body

    def isDeleted(self):
        # Return the 'is_deleted' property.
        # Documented with a comment on purpose: the original has no docstring
        # and ZPublisher only publishes objects that have one.
        return self.getProperty('is_deleted')

    security.declareProtected(perm_del_documents, 'deleteMe')
    def deleteMe(self):
        # Flag the document as deleted and report the deletion through
        # self.sqls.  Documented with a comment on purpose: the original has
        # no docstring and ZPublisher only publishes objects that have one.
        self._updateProperty('is_deleted', 1)
        self.sqls.docDeleted(docid=self.getId())

    security.declareProtected(perm_view_document, 'getBody')
    def getBody(self):
        """ return body """
        return self.body

    security.declareProtected(perm_view_document, 'indexableContent')
    def indexableContent(self, fields):
        # Build the content collector handed to TextIndexNG: the full
        # document as text/html plus the marked words and the title, all as
        # unicode.  `fields` is not used.
        # Documented with a comment on purpose: the original has no docstring
        # and ZPublisher only publishes objects that have one.
        from textindexng.content import IndexContentCollector as ICC
        icc = ICC()
        doc = unicode(self.fulldoc, 'utf-8')
        icc.addBinary('getDocument', doc, 'text/html')
        marked = self.getMarkedWords()
        if isinstance(marked, str):
            # byte string -> unicode; unicode values pass through unchanged
            marked = unicode(marked, 'utf-8')
        icc.addContent('getMarkedWords', marked)
        icc.addContent('getTitle', unicode(self.getTitle(), 'utf-8'))
        return icc

    security.declareProtected(perm_view_document, 'getDocument')
    def getDocument(self):
        """ Return the full document source.

        Also sets the response Content-type to UTF-8 HTML, including when
        it is called internally by the extract*() methods.
        """
        self.REQUEST.RESPONSE.setHeader("Content-type","text/html; charset=UTF-8")
        return self.fulldoc

    def getVisibleText(self):
        # Return the stored `textcontent` attribute.
        # Documented with a comment on purpose: the original has no docstring
        # and ZPublisher only publishes objects that have one.
        return self.textcontent

    security.declareProtected(perm_view_document, 'getStatus')
    def getStatus(self):
        """ return document status
            0 - document is in editing mode
            1 - document is in marking mode
        """
        return self.getProperty('document_status')

    security.declareProtected(perm_view_document, 'getSubmitter')
    def getSubmitter(self):
        """ get the username who created document """
        return self.getOwnerTuple()[1]

    security.declareProtected(perm_view_document, 'getLastModder')
    def getLastModder(self):
        """ last modifier of the document; -1 when none is recorded """
        try:
            return self.lastModifier
        except AttributeError:
            return -1

    security.declareProtected(perm_change_status, 'start_modding')
    def start_modding(self, REQUEST):
        """ start modding document
            we should copy fulldoc, body, title, desc etc
            to orginial document holding attributes

            Redirects to 'mark.html'.
        """
        # NOTE(review): the source reached review with its indentation
        # collapsed; everything up to reindex_object() is taken to belong to
        # this `if`, the redirect to happen unconditionally -- confirm.
        if not self.getStatus():
            self._updateProperty('document_status', 1)
            self.sqls.docStatus(docid=self.getId(), status=1)
            import copy
            self.org_fulldoc = copy.deepcopy(self.fulldoc)
            self.org_title = copy.deepcopy(self.title)
            self.org_description = copy.deepcopy(self.description)
            self.org_body = copy.deepcopy(self.body)
            self.reindex_object()
        return REQUEST.RESPONSE.redirect('mark.html')

    security.declareProtected(perm_edit_document, 'extractTitle')
    def extractTitle(self):
        """ extract title from Document

        Raises IndexError when the document has no <title> element.
        """
        doc = xml.dom.minidom.parseString(self.getDocument())
        title = doc.getElementsByTagName('title')[0]
        return get_text(title)

    security.declareProtected(perm_edit_document, 'extractDescription')
    def extractDescription(self):
        """ extract description from Document

        Returns the content of the first <meta name="description"> element,
        or "" when there is none.
        """
        doc = xml.dom.minidom.parseString(self.getDocument())
        desc = doc.getElementsByTagName('meta')
        for x in desc:
            if x.getAttribute('name') == 'description':
                return x.getAttribute('content')
        return ""

    security.declareProtected(perm_edit_document, 'extractBody')
    def extractBody(self):
        """ extract body from document

        Returns the serialized <body> element as a UTF-8 byte string.
        Raises IndexError when the document has no <body> element.
        """
        doc = xml.dom.minidom.parseString(self.getDocument())
        bodytag = doc.getElementsByTagName('body')[0]
        stri = bodytag.toxml()
        # NOTE(review): this pattern arrived garbled (two empty groups with
        # the literal broken across lines).  It is reconstructed here as
        # "strip the enclosing body tags", which is what toxml() on the body
        # element plus the method name suggest -- confirm against the
        # deployed code.
        b = re.compile('(<body>)|(</body>)', re.I)
        stri = stri.encode('utf-8')
        res = b.sub('', stri)
        logger.debug("RES: %s", res)
        # decode/encode round trip kept from the original
        res = unicode(res, 'utf-8').encode('utf-8')
        return res

    security.declareProtected(perm_edit_document, 'saveDocument')
    def saveDocument(self,REQUEST):
        """ save modified document

        Takes the new source from REQUEST.kupu, re-derives everything via
        manage_afterAdd(), records the modifier and redirects to the
        document.
        """
        self.fulldoc = REQUEST.kupu
        self.manage_afterAdd(self,self.aq_parent)
        self.sqls.morfoDelete(docid=self.getId())
        self._tmp_doc = False
        modifier = str(REQUEST.AUTHENTICATED_USER)
        # Instances without the 'lastModifier' property get it created here.
        # The explicit check replaces a bare `except:` that would also have
        # hidden unrelated failures.
        if self.hasProperty('lastModifier'):
            self._updateProperty('lastModifier', modifier)
        else:
            self._setProperty('lastModifier', modifier, 'string')
        return REQUEST.RESPONSE.redirect(self.absolute_url())

    def saveInformat(self, REQUEST):
        """ save metadata """
        get_schema(self.getMetaSchema()).update(self, REQUEST)
        self._p_changed = True
        self.reindex_object()
        return

    security.declareProtected(perm_view_document, 'getMarks')
    def getMarks(self, REQUEST):
        """ return marks. for marking page """
        return self.Errors.getDocumentMarksHTML(self.getId(), str(REQUEST.AUTHENTICATED_USER))

    security.declarePrivate('_iterPendingMarks')
    def _iterPendingMarks(self, REQUEST, decode=None, limit=1000):
        """ Yield (pointer, code) for every submitted mark that must be stored.

        Marks are read from the request fields 'range_1' .. 'range_<limit-1>'.
        Each value is a sequence (pointer, code[, existing[, ...]]); `decode`,
        when given, is applied to the raw value first.

        Entries without a code are skipped.  Entries that carry an `existing`
        value are never yielded; those that also contain 'DELETEME' are
        removed through self.Errors.deleteError(existing).
        """
        for x in range(1, limit):
            pair = REQUEST.get('range_'+str(x), None)
            if not pair:
                continue
            if decode is not None:
                pair = decode(pair)
            logger.debug("range_%s: %r", x, pair)
            pointer = pair[0]
            try:
                code = pair[1]
            except IndexError:
                logger.debug("range_%s has no error code: %r", x, pair)
                continue
            try:
                existing = pair[2]
            except IndexError:
                existing = None
            if existing:
                # NOTE(review): nesting reconstructed from a source with
                # collapsed indentation -- the final `continue` is taken to
                # apply to every entry that has an `existing` value; confirm.
                if existing == 'DELETEME':
                    continue
                if 'DELETEME' in pair:
                    self.Errors.deleteError(existing)
                continue
            yield pointer, code

    security.declarePrivate('_resolveMarkPointer')
    def _resolveMarkPointer(self, pointer):
        """ Turn a mark pointer into (body, start_node, end_node, start_offset, end_offset).

        The pointer is split as '<path>#<x>:<offset>;<path>#<x>:<offset>'.
        Both paths are lower-cased, stripped of one leading '/' and evaluated
        as XPath relative to the freshly parsed <body>; the first node of each
        result is used.

        Raises IndexError when a path matches nothing and ValueError when the
        pointer does not have the expected shape.
        """
        from xml import xpath
        r_st, r_en = pointer.split(';')
        # parsed anew for every pointer, as in the original loops
        doc = xml.dom.minidom.parseString(self.getDocument())
        start, st_offset = r_st.split('#')
        st_offset = st_offset.split(':')[1]
        end, en_offset = r_en.split('#')
        en_offset = en_offset.split(':')[1]
        body = doc.getElementsByTagName('body')[0]
        start = start.lower()
        end = end.lower()
        if start.startswith('/'):
            start = start[1:]
        if end.startswith('/'):
            end = end[1:]
        logger.debug("resolving mark start %r", start)
        try:
            start_node = xpath.Evaluate(start, body)[0]  # always take the
        except IndexError:
            #
            # this is causing bug!
            # start == span/p[8]/span/text()
            # should be p[8]/span/text()
            logger.error("mark start path %r matched no node", start)
            raise
        end_node = xpath.Evaluate(end, body)[0]          # first node we get
        return body, start_node, end_node, int(st_offset), int(en_offset)

    security.declarePrivate('_markedText')
    def _markedText(self, start_node, end_node, st_offset, en_offset):
        """ Return the marked text between the two offsets as a UTF-8 byte string. """
        if start_node.isSameNode(end_node):
            content = start_node.nodeValue[st_offset:en_offset]
        else:
            content = self.extractMarkedContent(start_node, end_node, st_offset, en_offset)
        if isinstance(content, unicode):
            content = content.encode('utf-8')
        return content

    security.declareProtected('do_not_touch_me', 'saveMarksNG_rescue')
    def saveMarksNG_rescue(self, REQUEST):
        """ saving mark pointers

        Rescue variant of saveMarksNG: every 'range_N' value is a string that
        is evaluated into the mark tuple, the author is taken from the
        'username' request field, and the contexts are attached to the
        created error afterwards.
        """
        # SECURITY(review): eval() runs whatever expression the request
        # contains.  The behaviour is kept and only guarded by the
        # 'do_not_touch_me' permission; never grant it to untrusted users
        # and replace eval() with a literal-only parser when possible.
        for pointer, code in self._iterPendingMarks(REQUEST, decode=eval):
            body, start_node, end_node, st_offset, en_offset = self._resolveMarkPointer(pointer)
            content = self._markedText(start_node, end_node, st_offset, en_offset)
            author = REQUEST.get('username')
            err = self.Errors.addNewError(pointer, content, self.getId(), code, author)
            pre, post = self.extractContext(body, start_node, end_node, st_offset, en_offset, start=1)
            pre.reverse()
            err.addPreContext(pre)
            err.addPostContext(post)
        self.reindex_object()
        return REQUEST.RESPONSE.redirect(self.absolute_url())

    security.declareProtected(perm_mark_document, 'saveMarksNG')
    def saveMarksNG(self, REQUEST):
        """ saving mark pointers

        Stores every new mark submitted as 'range_N' on behalf of the
        authenticated user, together with its surrounding context, then
        reindexes the document and redirects to it.
        """
        for pointer, code in self._iterPendingMarks(REQUEST):
            body, start_node, end_node, st_offset, en_offset = self._resolveMarkPointer(pointer)
            content = self._markedText(start_node, end_node, st_offset, en_offset)
            author = str(REQUEST.AUTHENTICATED_USER)
            pre, post = self.extractContext(body, start_node, end_node, st_offset, en_offset, start=1)
            pre.reverse()
            self.Errors.addNewError(pointer, content, self.getId(), code, author, pre, post)
        self.reindex_object()
        return REQUEST.RESPONSE.redirect(self.absolute_url())

    security.declareProtected(perm_edit_document, 'extractContext')
    def extractContext(self, body, start_node, end_node, start_offset, end_offset, start = 0, res = None, seen_start=0, seen_end=0, pre='', post='', node=None):
        """ returning n contexts of different sizes
            ['sentences', 'from', 'closes', 'to', 'five'] #precontent
            ['sentences', 'from', 'closes', 'to', 'five'] #postcontent
        """
        # NOTE(review): only the first part of this method lies inside the
        # span laid out here.  It has been re-indented without any change to
        # its logic; the rest of the method follows this span untouched.
        if start:
            res = []
            pre = post = ''
            seen_start = seen_end = 0
            node = body
        # do stuff
        for x in node.childNodes:
            done_this = 0
            if x.isSameNode(start_node): seen_start = 1
            if x.isSameNode(end_node): seen_end = 1
            if x.isSameNode(start_node):
                pre += x.nodeValue[:start_offset]
                done_this = 1
            if x.isSameNode(end_node):
                post += x.nodeValue[end_offset:]
                done_this = 1
            if not seen_start and not seen_end and not done_this:
                if x.nodeValue is None: pre += ' '
                else: pre += x.nodeValue
            if not done_this and seen_start and not seen_end:
                # huh? it doesn't interest us?
                pass
pass if seen_end and not done_this: if x.nodeValue is None: post += ' ' else: post += x.nodeValue if x.hasChildNodes(): seen_start, seen_end, pre, post = self.extractContext(body, start_node, end_node, start_offset, end_offset, 0, res, seen_start, seen_end, pre, post, x) if start: # do postprocessing i = len(pre) buff = '' pre_sent = [] while i>0: i += -1 if pre[i] == '.' or pre[i] == '!' or pre[i] == '?': pre_sent.append(buff) buff = '' buff = pre[i] + buff pre_sent = pre_sent[:5] i = 0 buff = '' post_sent = [] while i