from AccessControl import ClassSecurityInfo from OFS.SimpleItem import SimpleItem from zope.interface import implements from Globals import InitializeClass from interfaces import IWordTree from permissions import perm_manage import logging logger=logging.getLogger("korpus") class WordTree(SimpleItem): """ wordtree """ implements(IWordTree) meta_type = 'WordTree' security = ClassSecurityInfo() security.declareObjectPublic() id = 'wordtree' def getStartChars(self): """ ad """ d = self.sqls.selectFirstChars() return [ c['char'] for c in d ] def getWordsBy(self): s = self.REQUEST.get('char') so = self.REQUEST.get('by', 'freq') cfrom=self.REQUEST.get('cfrom', '1') cto=self.REQUEST.get('cto', '1000000') korpus=self.REQUEST.SESSION.get('korpus', None) s = unicode(s, 'utf-8') # if len(s) != 1: s = u'a' if len(s) > 1: s = u'a' sortorder = 'count DESC' if so == 'alpha': sortorder = 'word' if so == 'prim': sortorder = 'prim' if so == 'lemma': sortorder = 'lemma' if korpus: kstr=",".join(["'"+k+"'" for k in korpus]) sqlres = self.sqls.getWordsByCorpus2(char=s+'%', order=sortorder, cfrom=cfrom, cto=cto, corpus=kstr) #sqlres = self.sqls.getWordsByCorpus(char=s+'%', order=sortorder, cfrom=cfrom, cto=cto, corpus=korpus) else: sqlres = self.sqls.getWords(char=s+'%', order=sortorder, cfrom=cfrom, cto=cto) res = [] for r in sqlres: res.append([r[0], r[1], r[3], r[2]]) return res # CREATE OR REPLACE FUNCTION reverse(text) RETURNS text AS $$ # SELECT array_to_string(ARRAY( # SELECT substring($1, s.i,1) FROM generate_series(length($1), 1, -1) AS s(i) # ), ''); # $$ LANGUAGE SQL IMMUTABLE STRICT; def getWordsReverseBy(self): s = self.REQUEST.get('char', '') so = self.REQUEST.get('by', 'alpha') cfrom=self.REQUEST.get('cfrom', '1') cto=self.REQUEST.get('cto', '1000000') s = unicode(s, 'utf-8') sortorder = 'alpha' if so == 'alpha': sortorder = 'reverse(word)' if so == 'word': sortorder = 'word' if so == 'freq': sortorder = 'count DESC' if so == 'prim': sortorder = 'prim' if so == 'lemma': sortorder = 'lemma' sqlres = self.sqls.getWordsReverse(char='%'+s, order=sortorder, cfrom=cfrom, cto=cto) res = [] for r in sqlres: res.append([r[0], r[1], r[3], r[2]]) return res def getSyllableGlobalCount(self, syllable): "syllablesglobalcounts" sqlres = self.sqls.getSyllableGlobalCount(syllable=syllable) for r in sqlres: return str(r[0])+";"+syllable # CREATE FUNCTION last_post(text,char) RETURNS integer AS $$ # select length($1)- length(regexp_replace($1, '.*' || $2,'')); # $$ LANGUAGE SQL IMMUTABLE; def getSyllableEnds(self, orderby="count"): "wordsyllableends" sqlres = self.sqls.groupBySyllableEnd() res =[] for r in sqlres: a = r[1].strip() if a[0] =="-":a = a[1:] b=a[::-1] res.append([r[0], r[1].strip(), a, b]) from operator import itemgetter if orderby=='count': res.sort(key=itemgetter(0)) if orderby=='countreverse': res.sort(key=itemgetter(0)) res.reverse() if orderby=='syllable': res.sort(key=itemgetter(1)) if orderby=='syllablereverse': res.sort(key=itemgetter(3)) return res def getGroupCounts(self, m): "grouping" t={} n=list(set(m)) #erinevad for x in n: t[x]=0 for w in m: t[w]+=1 return t def textToSession(self): "texttosession" text = self.REQUEST.get('tekst', "") if text: self.REQUEST.SESSION.set('usertext', text) self.REQUEST.SESSION.set('lemmas', None) self.REQUEST.SESSION.set('syllables', None) def textFromSession(self): "textfromsession" return self.REQUEST.SESSION.get('usertext', "Lorem ipsum") def getUserTextWords(self): "usertextwords" import re return [word.strip().lower() for word in re.split('[\.?!:;," ]', self.textFromSession()) if word] def getUserTextWordsAlpha(self): "usertextwordsinalphabeticalorder" words= self.getUserTextWords() words.sort() return words def getUserTextWordsReverse(self): "usertextwordsinreversealpha" m=[w[::-1] for w in self.getUserTextWords()] m.sort() return [w[::-1] for w in m] def getUserTextSyllables(self): "usertextsyllables" if self.REQUEST.SESSION.get("syllables", None): return self.REQUEST.SESSION.get("syllables") words = self.getUserTextWords() res= [] for r in words: try: res.append([r, self.Search.getSyllables(r).strip()]) except: res.append([r, r]) self.REQUEST.SESSION.set("syllables", res) return res def getUserTextEndSyllables(self): "usertextendsyllables" res=[] for r in self.getUserTextSyllables(): res.append(r[1].split('-')[-1]) return res def getUserTextStartSyllables(self): "usertextstartsyllables" res=[] for r in self.getUserTextSyllables(): res.append(r[1].split('-')[0]) return res def getUserTextMiddleSyllables(self): "usertextmiddlesyllables" res=[] for r in self.getUserTextSyllables(): m=r[1].split('-') if len(m)>2: for syllable in m[1:-1]: res.append(syllable) return res def getUserTextSyllableCounts(self, syllablekey='syllablecount', desc=0, location='start'): "usertextsyllablecounts" if location=='start': syllables = self.getUserTextStartSyllables() if location=='middle': syllables = self.getUserTextMiddleSyllables() if location=='end': syllables = self.getUserTextEndSyllables() res =[] m=self.getGroupCounts(syllables) for r in m.keys(): help=r.strip() help2=help[::-1] res.append([r, m[r], help, help2]) from operator import itemgetter if syllablekey=='syllablealpha': res.sort(key=itemgetter(0)) if syllablekey=='syllablereverse': res.sort(key=itemgetter(3)) if syllablekey=='syllablecount': res.sort(key=itemgetter(1)) if desc: res.reverse() return res def getMorfo(self, text): "morfo" import urllib params=urllib.urlencode({'tekst': text}) addr="http://evkk.tlu.ee/Search/morfoVastus?"+params f=urllib.urlopen(addr) s=f.read() return s def getTextLemmas(self, text=""): "lemmad" if text=="" and self.REQUEST.SESSION.get("lemmas", None): return self.REQUEST.SESSION.get("lemmas") if text=="": text=self.textFromSession() s=self.getMorfo(text) m=s.strip().split("\n") t={} abi=[] i=0 while i0: if m[i][0]=='"' and i>0 and i=end: end = start + 40 if not filt: sr = self.sqls.getUndecided(limit=40, offset=start) else: filt = filt.replace('%', '') filt = filt.replace("'", '') modfilt = "'" if filt.startswith('*'): modfilt += '%' modfilt += filt if filt.endswith('*'): modfilt += '%' modfilt += "'" modfilt = modfilt.replace('*', '') sr = self.sqls.getUndecidedFilter(limit=40, offset=start, filter=modfilt) res = {} for r in sr: if not res.has_key(r[1]): res[r[1]] = [] res[r[1]].append({'id': r[0], 'option': r[2], 'language': r[3]}) return res def nOfUndecided(self): return self.sqls.getNofUndecided()[0][0] def nOfMorf(self): return self.sqls.getNofWords()[0][0] def getNewStart(self): # for previous link s = self.REQUEST.get('start', 0) if s != 0: s -= 40 if s < 0: s = 0 return s def getNewEnd(self): # for next link s = self.REQUEST.get('start', None) e = self.REQUEST.get('end', None) r = 40 if e is None and s is not None: r = s+40 return r security.declareProtected(perm_manage, 'unifyWords') def unifyWords(self, REQUEST): """ unify words """ for k in REQUEST.form.keys(): if k.endswith('_custom') or k.endswith('_alias') or k == 'unifyButton' or k == 'start': continue v = REQUEST.get(k) if k.startswith('option_'): # tick option with a ID in undecided table self.sqls.tickOption(id=v) else: # unik is a word # v is an action to be done unik = unicode(k, 'utf-8') morf = unicode(REQUEST.get(k+'_'+v), 'utf-8') # _cursom if morf[0] == '?': morf[1:] if v == 'custom': self.sqls.tickWithData(word=unik, option=morf) elif v == 'alias': # add alias self.sqls.addAlias(fromword=unik, toword=morf) else: raise 'unknown stuff!', REQUEST return REQUEST.RESPONSE.redirect(self.absolute_url()+'?sunif=1') def getDocumentBackrefs(self, word): # get word info korpus=self.REQUEST.SESSION.get('korpus', None) w = unicode(word, 'utf-8') res = [] sr = self.sqls.getDocWordStats(word=w) for r in sr: did = r[0] dc = r[1] title = r[2] if len(title.strip()) == 0: title = did if korpus: # pass try: if getattr(self.Documents, did).getCorpus() in korpus: res.append([did, dc, title]) except: pass else: res.append([did, dc, title]) return res def getWordCount(self, word): # get wordcount w = unicode(word, 'utf-8') korpus=self.REQUEST.SESSION.get('korpus', None) if korpus: kstr=",".join(["'"+k+"'" for k in korpus]) return self.sqls.getWordCountByCorpus(word=word, corpus=kstr)[0][0] return self.sqls.getWordCount(word=word)[0][0] def getTotalWordCount(self): korpus=self.REQUEST.SESSION.get('korpus', None) if korpus: kstr=",".join(["'"+k+"'" for k in korpus]) return self.sqls.getTotalWordCountByCorpus(corpus=kstr)[0][0] return self.sqls.getTotalWordCount()[0][0] def getIsAlias(self, word): refs = self.sqls.getRefFrom(word=word) if len(refs) > 0: return [ r[0] for r in refs ] return False def getHasAlias(self, word): refs = self.sqls.getRefTo(word=word) if len(refs) > 0: return [ r[0] for r in refs ] return False def getWordsByLemma(self, lemma): # return words with a same lemma lems = self.sqls.wordsByLemma(lemma=lemma) r = [] for l in lems: r.append(l[0]) return r def getWordsBySyllable(self, syllable): "syllables" words= self.sqls.wordsBySyllable(syllable=syllable) res=[] for r in words: res.append([r[0],r[1], r[3 ]]) return res def calculateSyllableTree(self, m, reverse=0): "input - array with syllabled words" root=SyllableTreeNode("", 0) for r in m: syllables=r.split("-") if reverse: syllables.reverse() root.addSyllables(syllables) return root def getUserTextSyllableTreeText(self): "Tree from user data" return self.calculateSyllableTree([r[1] for r in self.getUserTextSyllables()]).getString() def getWordsSyllableTreeText(self, type='string', reverse=0): "Tree from user data" tree=self.calculateSyllableTree([r[1] for r in self.sqls.getWordsSyllables()], reverse=reverse) if type=="xml": self.REQUEST.RESPONSE.setHeader('Content-type', 'text/xml') return ""+tree.getXML() if reverse: return tree.getReverseString() return tree.getString() InitializeClass(WordTree) class SyllableTreeNode: "node" def __init__(self, value, level): self.value=value self.children={} self.count=0 self.level=level def addSyllables(self, syllables): self.count+=1 logger.info(syllables) if(syllables): if syllables[0] not in self.children.keys(): self.children[syllables[0]]=SyllableTreeNode(syllables[0], self.level+1) self.children[syllables[0]].addSyllables(syllables[1:]) def getString(self): t=self.level*" "+self.value+" ("+str(self.count)+")\n" m=self.children.values() m=sorted(m, key=lambda c: -c.count) for c in m: t+=c.getString() # for c in self.children.keys(): # t+=self.children[c].getString() return t def getReverseString(self): m=self.children.values() m=sorted(m, key=lambda c: -c.count) t="" for c in m: t+=c.getReverseString() t+=(10-self.level)*" "+self.value+" ("+str(self.count)+")\n" return t def getXML(self): t=self.level*" "+"\n" m=self.children.values() m=sorted(m, key=lambda c: -c.count) for c in m: t+=c.getXML() t+="" return t