"""Sentence pipeline for matching morphologically tagged words against rules.

``Pipe`` holds the words of one sentence, ``Word`` holds a word together with
its analysis strings, and ``RulesList`` holds the rules plus match statistics.

The module runs unchanged on Python 2 and Python 3.
"""
import re
import copy

try:  # Python 3
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlencode, urlopen

# Compiled once instead of on every checkContainsNumber() call.
_NUMBER_RE = re.compile(".*[0-9]+.*")


def _say(*parts):
    """Write *parts* separated by single spaces, like ``print a, b, c``."""
    print(' '.join('%s' % (part,) for part in parts))


class PipeIter(object):
    """Index-based iterator over the items stored in a ``Pipe``."""

    def __init__(self, obj):
        self.par = obj
        self._itercount = -1

    def __iter__(self):
        return self

    def next(self):
        """Return the next item or raise StopIteration at the end."""
        following = self._itercount + 1
        if following >= len(self.par._ds):
            raise StopIteration
        self._itercount = following
        return self.par._ds[following]

    __next__ = next  # Python 3 iterator protocol


class Pipe(object):
    """One sentence: an ordered list of words plus filtering/matching state."""

    # _match() prints a debugging trace when ``documentid`` equals this value.
    DEBUG_DOCUMENT_ID = 'doc_441779960552_item'

    def __init__(self):
        self._ds = []
        self._rulematches = []
        self._ruleparts = []
        self._partlength = []
        self.skipped = False
        self.partcount = 0
        self.onlyfirstpart = True
        self.skipcomments = []
        # Read by _match()/addRuleMatch(); initialised here so those methods
        # work even when nothing has set them beforehand.
        self.partlength = 0
        self.coef = float(0)
        self.documentid = None

    def __iter__(self):
        return PipeIter(self)

    def __str__(self):
        text = ' '.join([w.getWord() for w in self])
        if not isinstance(text, str):
            # Python 2 unicode: keep returning UTF-8 encoded bytes.
            text = text.encode('utf-8')
        return text

    def append(self, o):
        """Add *o* to the end of the sentence."""
        self._ds.append(o)

    def match(self, rule, rules):
        """Match *rule* against each run of words that precedes a ' CLB ' word.

        Returns the result of the first run when ``onlyfirstpart`` is set,
        otherwise the first truthy result. Words after the last ' CLB ' word
        are never handed to ``_match``.
        """
        part = []
        result = False
        for w in self:
            if w.hasMarking(' CLB '):
                result = self._match(rule, rules, part)
                if self.onlyfirstpart:
                    return result
                if result:
                    break
                part = []
            part.append(w)
        return result

    def _match(self, rule, rules, sentence):
        """Consume *rule* element by element against the words of *sentence*.

        *rules* supplies ``skip_list``, ``restart_list``,
        ``allowed_lookbacks`` and ``null_prev_on_skip``. On success
        ``self.coef`` is set to rule length / counted words. *rule* itself is
        not modified (a copy is consumed).
        NOTE(review): an empty *rule* still reaches ``rule.pop(0)``; callers
        presumably always pass a non-empty rule - confirm.
        """
        skiplist = rules.skip_list
        restart = rules.restart_list
        allowed_look_backs = rules.allowed_lookbacks
        self.partcount = 0
        rule = copy.copy(rule)
        rule_bak = copy.copy(rule)
        if self.isSkipped():
            return False
        prev_rp = None
        doc_id = getattr(self, 'documentid', None)
        debug = doc_id is not None and doc_id == self.DEBUG_DOCUMENT_ID
        if debug:
            _say("algus!")
            _say(self)
            _say("reegel:", rule)
        self.partlength = 0
        self.coef = float(0)
        find_clb = False
        rule_length = len(rule)
        for i, w in enumerate(sentence):
            if i > 2 and w.infoContains("@FMV"):
                _say("kahtlane lause ")
            if w.isIgnored():
                if debug:
                    _say("jatan vahele", i)
                if rules.null_prev_on_skip:
                    prev_rp = None
                continue
            self.partlength = self.partlength + 1
            if find_clb:
                # Rule already consumed: keep counting words up to the
                # clause boundary.
                if w.hasMarking(' CLB ') and self.onlyfirstpart:
                    break
                continue
            # The entries of the restart list are not inspected here; a
            # non-empty list merely enables the ' CLB ' restart check.
            if restart and w.hasMarking(' CLB '):
                if debug:
                    _say("clb asi")
                rule = copy.copy(rule_bak)
                self.partcount = self.partcount + 1
                self.partlength = 0
                continue
            rp = rule.pop(0)
            if debug:
                _say("rule:", rp)
            if not w.hasMarking(rp, skiplist):
                if debug:
                    _say(prev_rp)
                if rp in allowed_look_backs and prev_rp in allowed_look_backs:
                    if debug:
                        _say("doing prev matching!")
                    if prev_rp and not w.hasMarking(prev_rp, skiplist):
                        # Leave a filler so the rule is not seen as consumed.
                        rule.append(' ')
                        break
                    elif prev_rp is None:
                        break
                else:
                    # No match: leave a filler so the rule is not seen as
                    # consumed, then stop.
                    rule.append(' ')
                    break
            elif debug:
                _say("match!", i)
            # NOTE(review): nesting reconstructed from collapsed source; this
            # assignment is assumed to run after both branches above.
            prev_rp = rp
            if len(rule) == 0:
                find_clb = True
        if rule:
            return False
        if self.partlength:
            self.coef = float(rule_length) / float(self.partlength)
        # partlength == 0 (nothing counted): coef stays 0.0 instead of
        # raising ZeroDivisionError.
        if debug:
            _say("rule match:", rule_bak)
        return True

    def addRuleMatch(self, rid):
        """Record *rid* with the part count and coefficient of the last match."""
        self._rulematches.append(rid)
        self._ruleparts.append(self.partcount)
        self._partlength.append(self.coef)

    def isExistingRule(self, rule_list, rule):
        "is in list?"
        return rule in rule_list.values()

    def getPossibleRule(self, conf, debug=False):
        """Build the mark sequence of the first clause.

        *conf* supplies ``needed`` (the allowed marks), ``skip_list``,
        ``restart_list`` and ``allowed_lookbacks``. Returns ``[[]]`` for a
        skipped sentence, a one-element list holding the collected marks when
        a restart mark is reached, and ``[]`` otherwise.
        """
        all_rules = []
        rule = []
        searchables = conf.needed
        skiplist = conf.skip_list
        restart = conf.restart_list
        if self.isSkipped():
            return [rule]
        for w in self:
            if w.isIgnored():
                if debug:
                    _say(w.getInfo(), " IGNORED")
                continue
            if debug:
                _say(w.getInfo(), "")
            for remark in restart:
                if w.hasMarking(remark):
                    # Only the first clause is used, so return right away.
                    all_rules.append(rule)
                    return all_rules
            has_mark = False
            for s in searchables:
                if w.hasMarking(s, skiplist):
                    if s in conf.allowed_lookbacks and len(rule) > 0:
                        # Look-back marks are not repeated back to back.
                        if s != rule[-1]:
                            rule.append(s)
                    else:
                        rule.append(s)
                    has_mark = True
                    break
            if not has_mark and len(rule) > 0:
                if debug:
                    _say("show-stopper:", w.getWord(), w.getInfo())
                break
        return all_rules

    def skipMe(self):
        """Flag the sentence as skipped."""
        self.skipped = True

    def isSkipped(self):
        """Return True when the sentence has been flagged as skipped."""
        return self.skipped

    def _findStopMark(self, rlist):
        """Return the first ``rlist['marks']`` entry found in any info string."""
        for w in self:
            for infos in w.getInfo():
                for r in rlist['marks']:
                    if r in infos:
                        return r
        return None

    def _findStopWord(self, rlist):
        """Return the first ``rlist['words']`` entry equal to a lowercased word."""
        for w in self:
            lowered = w.getWord().lower()
            for rw in rlist['words']:
                if lowered == rw:
                    return rw
        return None

    def checkFullstop(self, rlist):
        """Skip the sentence if it has a stop mark or a stop word.

        Every word is checked against every entry of ``rlist['marks']``
        (substring of an info string) and ``rlist['words']`` (lowercased
        equality). The first hit of each kind is recorded in
        ``skipcomments``.
        NOTE(review): the collapsed source ended the word scan with a stray
        ``break`` that stopped it early; scanning all words is assumed to be
        the intent - confirm.
        """
        mark = self._findStopMark(rlist)
        if mark is not None:
            self.skipMe()
            self.skipcomments.append("mark " + mark)
        word = self._findStopWord(rlist)
        if word is not None:
            self.skipMe()
            self.skipcomments.append("word " + word)

    def checkIgnoredWords(self, wlist):
        """Mark every word whose lowercased form is in *wlist* as ignored."""
        for w in self:
            if w.getWord().lower() in wlist:
                w.setIgnored(True)

    @staticmethod
    def _hasNeededMark(w, neededlist, skiplist):
        """Tell whether *w* carries a needed mark that keeps it from being ignored.

        NOTE(review): branch nesting reconstructed from collapsed source. A
        needed mark listed in *skiplist* counts only when the word also
        carries one of that mark's skip-list entries - confirm.
        """
        for need in neededlist:
            if not w.hasMarking(need):
                continue
            if need not in skiplist:
                return True
            if any(w.hasMarking(m) for m in skiplist.get(need)):
                return True
        return False

    def checkIgnoredMarks(self, ignlist, neededlist, skiplist):
        """Ignore words that carry a mark from *ignlist* and no needed mark."""
        for w in self:
            if w.isIgnored():
                continue
            # The decision does not depend on the individual info strings,
            # so it is taken once per word rather than once per info string.
            if self._hasNeededMark(w, neededlist, skiplist):
                continue
            if any(w.hasMarking(ign) for ign in ignlist):
                w.setIgnored(True)

    def checkIgnoredWithMark(self, checklist):
        """Ignore listed words that carry a given mark.

        *checklist* maps a mark to a dict with ``wordlist`` (lowercase words)
        and ``ignore_ignore_pos`` (word indexes that are never ignored).
        """
        for wi, w in enumerate(self):
            if w.isIgnored():
                continue
            for k, ignv in checklist.items():
                if not w.hasMarking(k):
                    continue
                lowered = w.getWord().lower()
                listed = any(v == lowered for v in ignv['wordlist'])
                if listed and wi not in ignv['ignore_ignore_pos']:
                    w.setIgnored(True)

    def checkFmvPosition(self):
        """Note in ``skipcomments`` when an @FMV word comes after position 2.

        The position counts non-ignored words without @NEG since the last
        CLB word. The sentence is not skipped, only commented.
        NOTE(review): nesting reconstructed from collapsed source.
        """
        if self.isSkipped():
            return
        pos = 0
        for w in self:
            at_boundary = w.hasMarking("CLB")
            if at_boundary and self.onlyfirstpart:
                return
            if not w.isIgnored() and not w.hasMarking("@NEG"):
                pos = pos + 1
            if at_boundary:
                pos = 0
            if w.hasMarking("@FMV") and pos > 2:
                self.skipcomments.append("fmv taga")

    def checkPrdPosition(self):
        """Skip the sentence when a @PRD word is followed by other words.

        Scanning stops at the first non-ignored CLB word.
        NOTE(review): nesting reconstructed from collapsed source; the
        ``return`` is assumed to belong to the CLB branch.
        """
        if self.isSkipped():
            return
        pos = -1
        for wi, w in enumerate(self):
            if w.isIgnored():
                continue
            if w.hasMarking("@PRD"):
                pos = wi
                continue
            if w.getWord() == ".":
                continue
            if w.hasMarking("CLB"):
                if pos >= 0 and pos != wi - 1:
                    self.skipcomments.append("PRD not in part end")
                return
            if pos != -1:
                pos = -2
        if pos == -2 and not self.isSkipped():
            self.skipMe()
            self.skipcomments.append("PRD not in end")

    def checkFcvFmv(self):
        """Note in ``skipcomments`` when a clause has no @FCV/@FMV word.

        The sentence is not skipped, only commented.
        """
        if self.isSkipped():
            return
        found = False
        for w in self:
            if w.hasMarking("@FCV") or w.hasMarking("@FMV"):
                found = True
            if w.hasMarking("CLB"):
                if not found:
                    self.skipcomments.append("FCV or FMV not found before CLB")
                found = False
                if self.onlyfirstpart:
                    return
        if not found:
            self.skipcomments.append("FCV or FMV not found ")

    def checkImvPosition(self):
        """Note in ``skipcomments`` when an @IMV word is not last in its part.

        The sentence is not skipped, only commented.
        """
        if self.isSkipped():
            return
        pos = -1
        for wi, w in enumerate(self):
            if w.isIgnored():
                continue
            if w.hasMarking("@IMV"):
                pos = wi
                continue
            if w.getWord() == ".":
                continue
            if w.hasMarking("CLB"):
                if pos > -1 and wi - 1 != pos:
                    self.skipcomments.append("IMV is not in part end")
                if self.onlyfirstpart:
                    return
                pos = -1
                continue
            if pos != -1:
                pos = -2
        if pos == -2:
            self.skipcomments.append("IMV not in end ")

    def eelSona(self, sona):
        """Return 1 if *sona* has @NN together with el, abes or kom, else 0."""
        if sona.hasMarking("@NN"):
            for case in ("el", "abes", "kom"):
                if sona.hasMarking(case):
                    return 1
        return 0

    def checkSLopus(self):
        """Return 1 if the sentence ends in: el/abes/kom @NN word, an optional
        intermediate word, then an S word (optionally followed by Fst).

        Always returns 0 or 1. Positions before the start of the sentence are
        never inspected; a negative index would wrap to the end of the list.
        """
        koht = len(self._ds) - 1
        if koht < 0:
            return 0
        if self._ds[koht].hasMarking("Fst"):
            koht = koht - 1
        if koht < 0:
            return 0
        if not self._ds[koht].hasMarking("S"):
            return 0
        # Look at the word right before the S word, then one further back.
        for candidate in (koht - 1, koht - 2):
            if candidate < 0:
                break
            if self.eelSona(self._ds[candidate]):
                return 1
        return 0

    def checkContainsNumber(self):
        """Skip the sentence if any word contains a digit."""
        for w in self:
            if _NUMBER_RE.match(w.getWord()):
                self.skipcomments.append("contains number")
                self.skipMe()

    def checkWordCorrect(self, word):
        """Ask the filosoft.ee speller about *word*; 0 if unknown, else 1.

        Performs an HTTP request for every call.
        """
        params = urlencode({'doc': word.encode('utf-8'), 'out': 'M'})
        addr = "http://www.filosoft.ee/html_speller_et/html_spell.cgi?" + params
        f = urlopen(addr)
        try:
            s = f.read()
        finally:
            f.close()
        # The response body is bytes on Python 3, hence the bytes needle.
        if s.find(b"red 2px") > 0:
            return 0
        return 1

    def checkWordsCorrect(self):
        """Skip the sentence if the speller does not know one of its words."""
        for w in self:
            if self.checkWordCorrect(w.getWord()) == 0:
                self.skipcomments.append(w.getWord() + " tundmatu")
                self.skipMe()


class Word(object):
    """A word form, its analysis strings and an 'ignored' flag."""

    def __init__(self, w):
        self.word = w
        self.ignored = False
        self.inf = []

    def __str__(self):
        ignf = '1' if self.isIgnored() else '0'
        return " %s %-20s %s" % (ignf, self.word, self.inf)

    def addInfo(self, inf):
        """Append one analysis string."""
        self.inf.append(inf)

    def getInfo(self):
        """Return the list of analysis strings."""
        return self.inf

    def infoContains(self, s):
        """Return 1 if *s* equals one of the analysis strings, else 0."""
        return 1 if s in self.inf else 0

    def getWord(self):
        """Return the word form."""
        return self.word

    def isIgnored(self):
        """Return the 'ignored' flag."""
        return self.ignored

    def setIgnored(self, ign):
        """Set the 'ignored' flag; *ign* must be a bool."""
        assert isinstance(ign, bool)
        self.ignored = ign

    def hasMarking(self, mark, skiplist=None):
        """Tell whether *mark* is a substring of any analysis string.

        When *skiplist* maps *mark* to a list of marks and one of those also
        occurs in an analysis string, the result is False.
        """
        if not any(mark in info for info in self.inf):
            return False
        if skiplist and mark in skiplist:
            for sm in skiplist.get(mark):
                if any(sm in info for info in self.inf):
                    return False
        return True


class RulesIter(object):
    """Iterator over a ``RulesList`` yielding ``[rule_id, rule]`` pairs."""

    def __init__(self, par):
        self.par = par
        self.c = -1

    def __iter__(self):
        return self

    def next(self):
        """Return the next ``[rule_id, rule]`` or raise StopIteration."""
        following = self.c + 1
        if following >= len(self.par._rids):
            raise StopIteration
        self.c = following
        rid = self.par._rids[following]
        return [rid, self.par._rules[rid]]

    __next__ = next  # Python 3 iterator protocol


class RulesList(object):
    """Ordered rule collection with per-rule match statistics.

    *_rlist* is a list of dicts; the first key of each dict is the rule id
    and its value is the rule.
    """

    def __init__(self, _rlist):
        self._statistics = {}
        self._statcoef = {}
        self._rids = []
        self._rawrules = _rlist
        self._rules = {}
        for entry in _rlist:
            rid = next(iter(entry))
            self._rids.append(rid)
            self._rules[rid] = entry[rid]

    def __iter__(self):
        return RulesIter(self)

    def printStats(self):
        """Print match counts (highest first), rules and coefficient buckets."""
        _say()
        ranked = sorted(self._statistics.items(), key=lambda x: x[1],
                        reverse=True)
        total = 0
        for rid, count in ranked:
            _say(str(count).ljust(5), self._rules[rid], self._statcoef[rid])
            total += count
        _say("Total rule instances matched", total)

    def get(self, rid):
        """Return the rule stored under *rid*."""
        return self._rules[rid]

    def addStat(self, rid, coef=-1):
        """Count one match of *rid* and file *coef* into one of five buckets.

        Buckets: exactly 1, >= 0.8, >= 0.6, >= 0.4, everything else.
        """
        self._statistics[rid] = self._statistics.get(rid, 0) + 1
        buckets = self._statcoef.setdefault(rid, [0, 0, 0, 0, 0])
        if coef == 1:
            slot = 0
        elif coef >= 0.8:
            slot = 1
        elif coef >= 0.6:
            slot = 2
        elif coef >= 0.4:
            slot = 3
        else:
            slot = 4
        buckets[slot] = buckets[slot] + 1


if __name__ == '__main__':
    p = Pipe()
    _say([x for x in p])
    p.append(1)
    p.append(2)
    bound_append = getattr(p, 'append')
    _say(bound_append)
    _say([x for x in p])
    for x in p:
        for y in p:
            _say(x, y)
    _say(1 in p)
    w = Word('asd')
    w in ['1', '2', '3']