# -*- coding: utf-8 -*-
# The encoding line above is required: the word-splitting pattern below
# contains non-ASCII quotation marks.
#
# NOTE(review): this file was recovered from a copy in which all line breaks
# were collapsed and every "<...>" span was stripped.  Places where text is
# known or suspected to be missing are marked NOTE(review) below and must be
# checked against version control.  Runs of whitespace inside string literals
# may also have been collapsed; the literals are reproduced as found.
import logging
import re
import urllib
from operator import itemgetter

from AccessControl import ClassSecurityInfo
from OFS.SimpleItem import SimpleItem
from zope.interface import implements
from Globals import InitializeClass

from interfaces import IWordTree
from permissions import perm_manage

logger = logging.getLogger("korpus")

# Page size used by the "undecided words" listing and its prev/next links.
PAGE_SIZE = 40

# Separators used to split the user's text into words.  The pattern text is
# unchanged from the original literal.
# NOTE(review): if this literal is a Python 2 byte string, the two curly
# quotes are matched byte-by-byte rather than as characters - confirm that is
# acceptable for the texts being processed.
_WORD_SPLIT_RE = re.compile(r'[\.?!:;,“„" ]')


def _as_int(value, default):
    """Return ``int(value)``, or ``default`` when value is not a number."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def _strip_dash(text, leading):
    """Drop one hyphen from the start (leading) or the end of ``text``.

    Safe on the empty string, unlike indexing ``text[0]`` / ``text[-1]``.
    """
    if leading:
        if text.startswith("-"):
            return text[1:]
    elif text.endswith("-"):
        return text[:-1]
    return text


def _syllable_rows(sqlres, leading):
    """Build ``[count, label, bare, reversed bare]`` rows from SQL rows.

    Each SQL row is read as ``r[0]`` (first column) and ``r[1]`` (syllable
    text); ``leading`` selects which end of the text loses its hyphen.
    """
    rows = []
    for r in sqlres:
        label = r[1].strip()
        bare = _strip_dash(label, leading)
        rows.append([r[0], label, bare, bare[::-1]])
    return rows


def _sort_syllable_rows(rows, orderby):
    """Sort rows built by _syllable_rows in place and return them.

    Unknown ``orderby`` values leave the rows in query order.
    """
    if orderby == 'count':
        rows.sort(key=itemgetter(0))
    elif orderby == 'countreverse':
        # sort + reverse (rather than reverse=True) keeps the original
        # ordering of rows with equal counts.
        rows.sort(key=itemgetter(0))
        rows.reverse()
    elif orderby == 'syllable':
        rows.sort(key=itemgetter(1))
    elif orderby == 'syllablereverse':
        rows.sort(key=itemgetter(3))
    return rows


def _count_rows(counts, strip_leading_dash, syllablekey, desc):
    """Turn a ``{syllable: count}`` dict into sorted display rows.

    Rows are ``[syllable, count, cleaned, reversed cleaned]``.
    """
    rows = []
    for syllable in counts:
        cleaned = syllable.strip()
        if strip_leading_dash:
            cleaned = _strip_dash(cleaned, True)
        rows.append([syllable, counts[syllable], cleaned, cleaned[::-1]])
    if syllablekey == 'syllablealpha':
        rows.sort(key=itemgetter(0))
    elif syllablekey == 'syllablereverse':
        rows.sort(key=itemgetter(3))
    elif syllablekey == 'syllablecount':
        rows.sort(key=itemgetter(1))
    if desc:
        rows.reverse()
    return rows


class WordTree(SimpleItem):
    """Word, lemma and syllable statistics for the corpus and for user text.

    Database access goes through ``self.sqls`` and syllabification through
    ``self.Search``; neither is defined in this module.
    """
    # ZPublisher only publishes methods that have a docstring.  Methods that
    # had no docstring are therefore documented with comments only, so that
    # what is reachable by URL does not change.

    implements(IWordTree)
    meta_type = 'WordTree'
    security = ClassSecurityInfo()
    security.declareObjectPublic()

    id = 'wordtree'

    def getStartChars(self):
        """Return the ``char`` value of every row of selectFirstChars()."""
        return [c['char'] for c in self.sqls.selectFirstChars()]

    def getWordsBy(self):
        # List words starting with REQUEST['char'], ordered per REQUEST['by']
        # ('freq' default, 'alpha', 'prim', 'lemma') and limited to the count
        # range cfrom..cto.  Rows are returned as [r[0], r[1], r[3], r[2]].
        req = self.REQUEST
        # Default to '' so a missing 'char' no longer crashes in unicode().
        s = unicode(req.get('char', ''), 'utf-8')
        so = req.get('by', 'freq')
        cfrom = req.get('cfrom', '1')
        cto = req.get('cto', '1000000')
        if len(s) > 1:
            s = u'a'
        # The ORDER BY text is chosen from fixed values only; the request
        # value itself is never passed on.
        sortorder = 'count DESC'
        if so == 'alpha':
            sortorder = 'word'
        elif so == 'prim':
            sortorder = 'prim'
        elif so == 'lemma':
            sortorder = 'lemma'
        sqlres = self.sqls.getWords(char=s + '%', order=sortorder,
                                    cfrom=cfrom, cto=cto)
        return [[r[0], r[1], r[3], r[2]] for r in sqlres]

    def getWordsReverseBy(self, pre='%', post=''):
        # Like getWordsBy, but matches pre + REQUEST['char'] + post (by
        # default: words ending in 'char') and appends r[5] to each row.
        req = self.REQUEST
        s = unicode(req.get('char', ''), 'utf-8')
        so = req.get('by', 'alpha')
        cfrom = req.get('cfrom', '1')
        cto = req.get('cto', '1000000')
        # NOTE(review): 'alpha' is the fallback for unrecognised 'by' values,
        # exactly as in the original; confirm the query accepts it.
        sortorder = 'alpha'
        if so == 'alpha':
            sortorder = 'reverse(word)'
        elif so == 'word':
            sortorder = 'word'
        elif so == 'freq':
            sortorder = 'count DESC'
        elif so == 'prim':
            sortorder = 'prim'
        elif so == 'lemma':
            sortorder = 'lemma'
        sqlres = self.sqls.getWordsReverse(char=pre + s + post,
                                           order=sortorder,
                                           cfrom=cfrom, cto=cto)
        return [[r[0], r[1], r[3], r[2], r[5]] for r in sqlres]

    # NOTE(review): in the collapsed source it is not visible whether the
    # "if abi:" guard in getSyllableEnds / getSyllableMiddles / getSyllables
    # only protected the hyphen test or also skipped blank syllables.  Blank
    # syllables are kept here, as getSyllableBegins always appended them.

    def getSyllableEnds(self, orderby="count"):
        """Return word-final syllable rows, leading hyphen removed.

        orderby: 'count', 'countreverse', 'syllable' or 'syllablereverse'.
        """
        rows = _syllable_rows(self.sqls.groupBySyllableEnd(), True)
        return _sort_syllable_rows(rows, orderby)

    def getSyllableBegins(self, orderby="count"):
        """Return word-initial syllable rows, trailing hyphen removed.

        A blank syllable no longer raises IndexError.
        """
        rows = _syllable_rows(self.sqls.groupBySyllableBegin(), False)
        return _sort_syllable_rows(rows, orderby)

    def getSyllableMiddles(self, orderby="count"):
        """Return word-internal syllable rows, trailing hyphen removed."""
        rows = _syllable_rows(self.sqls.groupBySyllableMiddles(), False)
        return _sort_syllable_rows(rows, orderby)

    def getSyllables(self, orderby="count"):
        """Return rows for all syllables, trailing hyphen removed."""
        rows = _syllable_rows(self.sqls.groupBySyllables(), False)
        return _sort_syllable_rows(rows, orderby)

    def getSyllableGlobalCount(self, syllable):
        """Return '<first column of first row>;<syllable>'.

        Returns None when the query yields no rows.
        """
        for r in self.sqls.getSyllableGlobalCount(syllable=syllable):
            return str(r[0]) + ";" + syllable

    def textToSession(self):
        """Store REQUEST['tekst'] in the session and drop derived caches."""
        text = self.REQUEST.get('tekst', "")
        if text:
            session = self.REQUEST.SESSION
            session.set('usertext', text)
            # Cached analyses belong to the previous text.
            session.set('lemmas', None)
            session.set('syllables', None)

    def textFromSession(self):
        """Return the user's text from the session, or a placeholder."""
        return self.REQUEST.SESSION.get('usertext', "Lorem ipsum")

    def getUserTextWords(self):
        """Return the user's text split into stripped, lower-cased words."""
        pieces = _WORD_SPLIT_RE.split(self.textFromSession())
        return [word.strip().lower() for word in pieces if word]

    def getUserTextWordsAlpha(self):
        """Return the user's words in alphabetical order."""
        words = self.getUserTextWords()
        words.sort()
        return words

    def getUserTextWordsReverse(self):
        """Return the user's words ordered by their reversed spelling."""
        backwards = [w[::-1] for w in self.getUserTextWords()]
        backwards.sort()
        return [w[::-1] for w in backwards]

    def getUserTextSyllables(self):
        """Return [word, syllabified word] pairs, cached in the session.

        A word whose syllabification fails is paired with "error".
        """
        session = self.REQUEST.SESSION
        if session.get("syllables", None):
            return session.get("syllables")
        res = []
        for word in self.getUserTextWords():
            try:
                res.append([word, self.Search.katse20(word).strip()])
            except Exception:
                # Best effort per word, but leave a trace of the failure.
                logger.exception("syllabification failed for %r", word)
                res.append([word, "error"])
        session.set("syllables", res)
        return res

    def getUserTextEndSyllables(self):
        """Return the last syllable of every word of the user's text."""
        return [r[1].split('-')[-1] for r in self.getUserTextSyllables()]

    def getUserTextStartSyllables(self):
        """Return the first syllable of every word of the user's text."""
        return [r[1].split('-')[0] for r in self.getUserTextSyllables()]

    def getUserTextMiddleSyllables(self):
        """Return every syllable that is neither first nor last in a word."""
        res = []
        for r in self.getUserTextSyllables():
            # The slice is empty for words of one or two syllables.
            res.extend(r[1].split('-')[1:-1])
        return res

    def getUserTextEndSyllableCounts(self, syllablekey='syllablecount',
                                     desc=0):
        """Return counted final syllables as sorted rows.

        syllablekey: 'syllablealpha', 'syllablereverse' or 'syllablecount';
        desc reverses the result.  An empty syllable no longer raises
        IndexError.
        """
        counts = self.getGroupCounts(self.getUserTextEndSyllables())
        return _count_rows(counts, True, syllablekey, desc)

    def getUserTextStartSyllableCounts(self, syllablekey='syllablecount',
                                       desc=0):
        """Return counted initial syllables as sorted rows."""
        counts = self.getGroupCounts(self.getUserTextStartSyllables())
        return _count_rows(counts, False, syllablekey, desc)

    def getUserTextMiddleSyllableCounts(self, syllablekey='syllablecount',
                                        desc=0):
        """Return counted middle syllables as sorted rows."""
        counts = self.getGroupCounts(self.getUserTextMiddleSyllables())
        return _count_rows(counts, True, syllablekey, desc)

    def getGroupCounts(self, m):
        """Return a dict mapping each distinct item of m to its frequency."""
        counts = dict.fromkeys(set(m), 0)
        for item in m:
            counts[item] += 1
        return counts

    def getUserTextWordsCounts(self):
        """Return the frequency of each word of the user's text."""
        return self.getGroupCounts(self.getUserTextWordsAlpha())

    def getUserSortedWordsCounts(self, key='word', desc=0):
        """Return [word, count] pairs for the user's text, sorted.

        key: 'word', 'reverseword' or 'count'; desc reverses the result.
        """
        counts = self.getUserTextWordsCounts()
        words = list(set(counts))
        if key == 'word':
            words.sort()
        if key == 'reverseword':
            backwards = [w[::-1] for w in words]
            backwards.sort()
            words = [w[::-1] for w in backwards]
        pairs = [[w, counts[w]] for w in words]
        if key == 'count':
            pairs.sort(key=itemgetter(1))
        if desc:
            pairs.reverse()
        return pairs

    security.declareProtected(perm_manage, 'saveWordsToFile')
    def saveWordsToFile(self):
        """ save words to file - one per line """
        f = open('/tmp/wordlist.txt', 'w')
        try:
            for k in self._word_count.keys():
                try:
                    f.write(k.encode('iso-8859-15') + '\r\n')
                except UnicodeEncodeError:
                    # Word cannot be represented in the target charset.
                    logger.warning("cannot encode word %r", k)
        finally:
            # Close the file even when a write fails.
            f.close()
        return "done"

    def getUndecided(self):
        # Return one page of undecided words as
        # {r[1]: [{'id': r[0], 'option': r[2], 'language': r[3]}, ...]}.
        # REQUEST['start'] is the offset; the page size is fixed, so the
        # 'end' value the original read (and never used) is ignored.
        start = max(_as_int(self.REQUEST.get('start', 0), 0), 0)
        filt = self.REQUEST.get('uni_startswith', '')
        if not filt:
            sr = self.sqls.getUndecided(limit=PAGE_SIZE, offset=start)
        else:
            # NOTE(review): the filter is quoted by hand, which suggests the
            # query inserts it verbatim.  Only % and ' are removed here -
            # confirm the query cannot be broken with other characters
            # (e.g. a backslash).
            filt = filt.replace('%', '')
            filt = filt.replace("'", '')
            modfilt = "'"
            if filt.startswith('*'):
                modfilt += '%'
            modfilt += filt
            if filt.endswith('*'):
                modfilt += '%'
            modfilt += "'"
            modfilt = modfilt.replace('*', '')
            sr = self.sqls.getUndecidedFilter(limit=PAGE_SIZE, offset=start,
                                              filter=modfilt)
        res = {}
        for r in sr:
            res.setdefault(r[1], []).append(
                {'id': r[0], 'option': r[2], 'language': r[3]})
        return res

    def nOfUndecided(self):
        return self.sqls.getNofUndecided()[0][0]

    def nOfMorf(self):
        return self.sqls.getNofWords()[0][0]

    def getNewStart(self):
        # for previous link: one page back, never below zero.  The value is
        # converted first because request values may arrive as strings.
        return max(_as_int(self.REQUEST.get('start', 0), 0) - PAGE_SIZE, 0)

    def getNewEnd(self):
        # for next link
        s = self.REQUEST.get('start', None)
        e = self.REQUEST.get('end', None)
        r = PAGE_SIZE
        if e is None and s is not None:
            r = _as_int(s, 0) + PAGE_SIZE
        return r

    security.declareProtected(perm_manage, 'unifyWords')
    def unifyWords(self, REQUEST):
        """ unify words

        Form keys starting with 'option_' tick the undecided option whose
        id is the value.  Any other key is a word whose value names the
        action ('custom' or 'alias'); the action's data is read from the
        form field '<word>_<action>'.  Raises ValueError for an unknown
        action, then redirects back to this object.
        """
        for k in REQUEST.form.keys():
            if (k.endswith('_custom') or k.endswith('_alias')
                    or k == 'unifyButton' or k == 'start'):
                continue
            v = REQUEST.get(k)
            if k.startswith('option_'):
                # tick option with a ID in undecided table
                self.sqls.tickOption(id=v)
                continue
            if v not in ('custom', 'alias'):
                # Was a string exception, which Python 2.6+ rejects.
                raise ValueError('unknown stuff! %r=%r' % (k, v))
            # unik is a word, v is the action to be done
            unik = unicode(k, 'utf-8')
            morf = unicode(REQUEST.get(k + '_' + v), 'utf-8')
            # Drop a leading '?'.  The original evaluated morf[1:] without
            # assigning it, and failed on an empty value.
            if morf.startswith(u'?'):
                morf = morf[1:]
            if v == 'custom':
                self.sqls.tickWithData(word=unik, option=morf)
            else:
                # add alias
                self.sqls.addAlias(fromword=unik, toword=morf)
        return REQUEST.RESPONSE.redirect(self.absolute_url() + '?sunif=1')

    def getDocumentBackrefs(self, word):
        # get word info: [r[0], r[1], title] per row of getDocWordStats,
        # where a blank title (r[2]) is replaced by r[0].
        w = unicode(word, 'utf-8')
        res = []
        for r in self.sqls.getDocWordStats(word=w):
            did = r[0]
            title = r[2]
            if len(title.strip()) == 0:
                title = did
            res.append([did, r[1], title])
        return res

    def getWordCount(self, word):
        # get wordcount.  The decoded word is passed on, as in
        # getDocumentBackrefs; the original decoded it and then queried with
        # the undecoded value.
        w = unicode(word, 'utf-8')
        return self.sqls.getWordCount(word=w)[0][0]

    def getTotalWordCount(self):
        return self.sqls.getTotalWordCount()[0][0]

    def getIsAlias(self, word):
        # First column of each getRefFrom row, or False when there is none.
        refs = self.sqls.getRefFrom(word=word)
        if len(refs) > 0:
            return [r[0] for r in refs]
        return False

    def getHasAlias(self, word):
        # First column of each getRefTo row, or False when there is none.
        refs = self.sqls.getRefTo(word=word)
        if len(refs) > 0:
            return [r[0] for r in refs]
        return False

    def getWordsByLemma(self, lemma):
        # return words with a same lemma
        return [l[0] for l in self.sqls.wordsByLemma(lemma=lemma)]

    def getWordsByDocrefLemma(self, lemma):
        """Return 'word (n)' strings for words with the same lemma."""
        lems = self.sqls.wordsByDocrefLemma(lemma=lemma)
        return [l[0] + ' (' + str(l[1]) + ')' for l in lems]

    def getLemmaByDocrefWord(self, word):
        """Return the first non-empty lemma found for word, else 'pole'."""
        for row in self.sqls.getLemmaByDocrefWord(word=word):
            if row[0]:
                return row[0]
        return 'pole'

    def getMorfo(self, text):
        """Return the raw response of the remote morphology service."""
        params = urllib.urlencode({'tekst': text})
        addr = "http://evkk.tlu.ee/Search/morfoVastus?" + params
        f = urllib.urlopen(addr)
        try:
            return f.read()
        finally:
            # Release the connection even if reading fails.
            f.close()

    def getTextLemmas(self, text=""):
        """Return the lemmas of text (default: the user's session text).

        Only the session-cache lookup could be recovered; see below.
        """
        if text == "" and self.REQUEST.SESSION.get("lemmas", None):
            return self.REQUEST.SESSION.get("lemmas")
        # NOTE(review): SOURCE LOST.  Everything from the middle of this
        # method up to the last three lines of a following tree-text method
        # was destroyed by tag stripping.  The lost span presumably also held
        # calculateSyllableTree(), which getDocrefsSyllableTreeText() calls
        # but which is not defined in what survives.  Restore this region
        # from version control.  Surviving residue, verbatim:
        #
        #   if text=="": text=self.textFromSession() s=self.getMorfo(text)
        #   m=s.strip().split("\n") t={} abi=[] i=0 while i0: if
        #   m[i][0]=='"' and i>0 and i"+tree.getXML() if reverse: return
        #   tree.getReverseString() return tree.getString()
        raise NotImplementedError(
            "getTextLemmas: the parsing code is missing from this copy of "
            "the source and must be restored")

    def getDocrefsSyllableTreeText(self, type='string', reverse=0):
        """Render the syllable tree built from getDocrefsSyllables().

        type 'xml' sets a text/xml content type and returns the XML form;
        otherwise the plain-text tree is returned, reversed when reverse is
        true.
        """
        # NOTE(review): calculateSyllableTree is not defined in the
        # surviving source (see getTextLemmas).
        tree = self.calculateSyllableTree(
            [r[1] for r in self.sqls.getDocrefsSyllables()],
            reverse=reverse)
        if type == "xml":
            self.REQUEST.RESPONSE.setHeader('Content-type', 'text/xml')
            # NOTE(review): the empty literal presumably held an XML
            # declaration that was stripped - restore it.
            return "" + tree.getXML()
        if reverse:
            return tree.getReverseString()
        return tree.getString()

    def katse25(self):
        """Return the number of rows of getDocrefsSyllables()."""
        return len(self.sqls.getDocrefsSyllables())


class SyllableTreeNode:
    """One syllable in a prefix tree of syllabified words.

    value is the syllable, level its depth, count the number of syllable
    sequences that passed through the node, children maps the next syllable
    to its node.
    """

    def __init__(self, value, level):
        self.value = value
        self.children = {}
        self.count = 0
        self.level = level

    def addSyllables(self, syllables):
        # Count this node, then push the remaining syllables down the
        # branch named by the first of them, creating it when needed.
        self.count += 1
        logger.info(syllables)
        if syllables:
            head = syllables[0]
            if head not in self.children:
                self.children[head] = SyllableTreeNode(head, self.level + 1)
            self.children[head].addSyllables(syllables[1:])

    def _children_by_count(self):
        # Children ordered from most to least frequent.
        return sorted(self.children.values(), key=lambda c: -c.count)

    def getString(self):
        # Plain-text tree: this node first, then its children, each line
        # indented by its level.
        parts = [self.level * " " + self.value
                 + " (" + str(self.count) + ")\n"]
        for c in self._children_by_count():
            parts.append(c.getString())
        return "".join(parts)

    def getReverseString(self):
        # Plain-text tree with children before their parent and the
        # indentation shrinking with depth.
        parts = [c.getReverseString() for c in self._children_by_count()]
        parts.append((10 - self.level) * " " + self.value
                     + " (" + str(self.count) + ")\n")
        return "".join(parts)

    def getXML(self):
        # NOTE(review): the element markup that surrounded each node was
        # stripped from this copy; only the indentation, the newline and an
        # empty closing literal survive.  The literals are kept exactly as
        # found - restore the tags from version control.
        t = self.level * " " + "\n"
        for c in self._children_by_count():
            t += c.getXML()
        t += ""
        return t


InitializeClass(WordTree)