"""Sentence pipeline, word and rule containers for mark-based rule matching.

A ``Pipe`` holds the words of one sentence, each ``Word`` carries a list of
info strings ("markings"), and a ``RulesList`` holds rules keyed by rule id
together with match statistics.  Marks are matched against info strings by
plain substring search.

The module is written to run unchanged on Python 2.6+ and Python 3.
"""
import re
import copy

# Marking at which ``Pipe.match`` cuts the sentence into segments and at
# which ``Pipe._match`` restarts a rule.
_CLB_MARK = ' CLB '

# ``Pipe._match`` prints a step-by-step trace for the pipe whose
# ``documentid`` attribute equals this value.
_TRACED_DOCUMENT_ID = 'doc_441779960552_item'


def _emit(*parts):
    """Print *parts* separated by single spaces, followed by a newline.

    Produces the same text as the Python 2 statement ``print a, b, c`` while
    being valid syntax on both Python 2 and Python 3.
    """
    print(' '.join(['%s' % (part,) for part in parts]))


def _as_native_str(text):
    """Return *text* as the interpreter's native ``str`` type.

    On Python 2 ``unicode`` is encoded to UTF-8 bytes; on Python 3 ``bytes``
    are decoded from UTF-8.  Native strings are returned untouched.
    """
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode('utf-8')
    return text.encode('utf-8')


class PipeIter(object):
    """Iterator over the items stored in a ``Pipe``."""

    def __init__(self, obj):
        self.par = obj
        self._itercount = -1

    def __iter__(self):
        return self

    def next(self):
        """Return the next stored item or raise ``StopIteration``."""
        # ``>=`` covers the empty pipe (-1 >= -1) as well as exhaustion.
        if self._itercount >= len(self.par._ds) - 1:
            raise StopIteration
        self._itercount += 1
        return self.par._ds[self._itercount]

    # Python 3 spelling of the iterator protocol.
    __next__ = next


class Pipe(object):
    """Ordered container of the words of one sentence.

    Besides the words it records which rules matched (``addRuleMatch``) and
    whether the sentence has been excluded from processing (``skipMe``).
    """

    def __init__(self):
        self._ds = []
        self._rulematches = []
        self._ruleparts = []
        self._partlength = []
        self.skipped = False
        self.partcount = 0
        # Set here so that addRuleMatch() and _match() never hit a missing
        # attribute; _match() overwrites partlength/coef on every call and
        # callers may assign documentid after construction.
        self.partlength = 0
        self.coef = float(0)
        self.documentid = None

    def __iter__(self):
        return PipeIter(self)

    def __str__(self):
        return ' '.join([_as_native_str(w.getWord()) for w in self])

    def append(self, o):
        """Append one item (normally a ``Word``) to the pipe."""
        self._ds.append(o)

    def match(self, rule, rules):
        """Try *rule* against each segment that ends at a boundary word.

        Words are collected until one carrying the ``' CLB '`` marking is
        met; the words collected so far are then handed to ``_match``.  The
        boundary word itself opens the next segment.

        rule  -- list of marks, matched left to right
        rules -- configuration object, see ``_match``

        Returns True as soon as one segment matches, otherwise False.
        """
        # NOTE(review): words after the last boundary word are never handed
        # to _match -- confirm every sentence ends with a ' CLB ' word.
        segment = []
        matched = False
        for w in self:
            if w.hasMarking(_CLB_MARK):
                matched = self._match(rule, rules, segment)
                if matched:
                    break
                segment = []
            segment.append(w)
        return matched

    def _match(self, rule, rules, sentence):
        """Match *rule* against the words in *sentence*.

        rule     -- list of marks; a private copy is consumed from the front
        rules    -- object providing ``skip_list``, ``restart_list``,
                    ``allowed_lookbacks`` and ``null_prev_on_skip``
        sentence -- list of word objects

        Side effects: resets ``self.partcount`` and (unless the pipe is
        skipped) ``self.partlength`` and ``self.coef``; on a match ``coef``
        becomes ``len(rule) / partlength``.

        Returns True when every mark of the rule was consumed, else False.
        A skipped pipe and an empty rule never match.
        """
        skiplist = rules.skip_list
        restart = rules.restart_list
        allowed_look_backs = rules.allowed_lookbacks
        self.partcount = 0
        rule = copy.copy(rule)
        rule_bak = copy.copy(rule)
        if self.isSkipped():
            return False
        prev_rp = None
        # Trace output is produced for one hard-coded document only.
        d = getattr(self, 'documentid', None) == _TRACED_DOCUMENT_ID
        if d:
            _emit("algus!")
            _emit(self)
            _emit("reegel:", rule)
        self.partlength = 0
        self.coef = float(0)
        rule_length = len(rule)
        if rule_length == 0:
            # Nothing to consume: popping from the rule or dividing by a zero
            # part length below would raise, so report "no match" instead.
            return False
        find_clb = False
        for i, w in enumerate(sentence):
            if i > 2 and w.infoContains("@FMV"):
                _emit("kahtlane lause ")
            if w.isIgnored():
                if d:
                    _emit("jatan vahele", i)
                # None prev_rp ?
                if rules.null_prev_on_skip:
                    prev_rp = None
                continue
            self.partlength = self.partlength + 1
            if find_clb:
                # The rule is fully consumed; keep counting words until the
                # next boundary so that coef reflects the whole part.
                if w.hasMarking(_CLB_MARK):
                    break
                continue
            # A boundary word restarts the rule, but only when the
            # configuration lists at least one restart mark.
            if restart and w.hasMarking(_CLB_MARK):
                if d:
                    _emit("clb asi")
                rule = copy.copy(rule_bak)
                self.partcount = self.partcount + 1
                self.partlength = 0
                continue
            rp = rule.pop(0)
            if d:
                _emit("rule:", rp)
            if not w.hasMarking(rp, skiplist):
                if d:
                    _emit(prev_rp)
                if rp in allowed_look_backs and prev_rp in allowed_look_backs:
                    if d:
                        _emit("doing prev matching!")
                    if prev_rp and not w.hasMarking(prev_rp, skiplist):
                        # Leave the rule non-empty so the final check fails.
                        rule.append(' ')
                        break
                    elif prev_rp is None:
                        break
                    # NOTE(review): when the word matches the previous mark,
                    # rp has already been popped and is not re-queued --
                    # confirm this is the intended look-back behaviour.
                else:
                    # break out - no match!
                    rule.append(' ')
                    break
            elif d:
                _emit("match!", i)
            prev_rp = rp
            if len(rule) == 0:
                find_clb = True
        if len(rule) != 0:
            return False
        self.coef = float(rule_length) / float(self.partlength)
        if d:
            _emit("rule match:", rule_bak)
        return True

    def addRuleMatch(self, rid):
        """Record rule id *rid* with the current part count and coefficient."""
        self._rulematches.append(rid)
        self._ruleparts.append(self.partcount)
        self._partlength.append(self.coef)

    def getPossibleRule(self, conf, debug=False):
        """Derive candidate rules (lists of marks) from the words of the pipe.

        conf  -- object providing ``needed`` (the searchable marks, a list of
                 allowed marks), ``skip_list``, ``restart_list`` and
                 ``allowed_lookbacks``
        debug -- when true, print each word's info while scanning

        Every word carrying a restart mark closes the rule collected so far.
        A mark listed in ``allowed_lookbacks`` is not appended twice in a
        row.  Scanning stops at the first word that has no searchable mark
        once a rule has been started.

        Returns the list of closed rules, or ``[[]]`` for a skipped pipe.
        """
        all_rules = []
        rule = []
        searchables = conf.needed
        skiplist = conf.skip_list
        restart = conf.restart_list
        if self.isSkipped():
            return [rule]
        for w in self:
            if w.isIgnored():
                if debug:
                    _emit(w.getInfo(), " IGNORED")
                continue
            if debug:
                _emit(w.getInfo(), "")
            has_mark = False
            for remark in restart:
                if w.hasMarking(remark):
                    all_rules.append(rule)
                    rule = []
                    has_mark = True
            if not has_mark:
                for s in searchables:
                    if not w.hasMarking(s, skiplist):
                        continue
                    repeated = (s in conf.allowed_lookbacks and len(rule) > 0
                                and s == rule[-1])
                    if not repeated:
                        rule.append(s)
                    has_mark = True
                    break
            if not has_mark and len(rule) > 0:
                if debug:
                    _emit("show-stopper:", w.getWord(), w.getInfo())
                break
        # NOTE(review): a rule still open when the loop ends is discarded.
        return all_rules

    def skipMe(self):
        """Mark the pipe as skipped."""
        self.skipped = True

    def isSkipped(self):
        """Return True when the pipe has been marked as skipped."""
        return self.skipped

    def checkFullstop(self, rlist):
        """Skip the pipe based on the ``'marks'`` and ``'words'`` of *rlist*.

        The pipe is skipped when any info string of any word contains one of
        ``rlist['marks']``, or when the lower-cased first word equals one of
        ``rlist['words']``.
        """
        for w in self:
            if any(r in infos
                   for infos in w.getInfo()
                   for r in rlist['marks']):
                self.skipMe()
                break
        for w in self:
            # NOTE(review): the unconditional break means only the first word
            # is compared against rlist['words'] -- confirm this is intended.
            if any(w.getWord().lower() == rw for rw in rlist['words']):
                self.skipMe()
            break

    def checkIgnoredWords(self, wlist):
        """Ignore every word whose lower-cased text is in *wlist*."""
        for w in self:
            if w.getWord().lower() in wlist:
                w.setIgnored(True)

    def checkIgnoredMarks(self, ignlist, neededlist, skiplist):
        """Ignore words that carry an ignorable mark and no needed mark.

        ignlist    -- marks that cause a word to be ignored
        neededlist -- marks that protect a word from being ignored
        skiplist   -- mapping mark -> list of marks; a needed mark found in
                      this mapping protects the word only if the word also
                      carries one of the mapped marks
        """
        for w in self:
            if w.isIgnored():
                continue
            if not w.getInfo():
                # No info strings: no marking can be present.
                continue
            found = False
            for need in neededlist:
                if not w.hasMarking(need):
                    continue
                if need in skiplist:
                    if any(w.hasMarking(m) for m in skiplist[need]):
                        found = True
                else:
                    found = True
                    break
            if not found:
                for ign in ignlist:
                    if w.hasMarking(ign):
                        w.setIgnored(True)

    def checkIgnoredWithMark(self, checklist):
        """Ignore listed words when they carry the associated mark.

        checklist -- mapping mark -> dict with ``'wordlist'`` (lower-case
                     words) and ``'ignore_ignore_pos'`` (word positions that
                     are exempt from being ignored)
        """
        for wi, w in enumerate(self):
            if w.isIgnored():
                continue
            for k, ignv in checklist.items():
                for infos in w.getInfo():
                    if k not in infos:
                        continue
                    for v in ignv['wordlist']:
                        if (v == w.getWord().lower()
                                and wi not in ignv['ignore_ignore_pos']):
                            w.setIgnored(True)

    def checkFmvPosition(self):
        """Skip the pipe when an ``@FMV`` word comes after position two.

        Positions count the non-ignored words that do not carry ``@NEG``.
        """
        pos = 0
        for w in self:
            if w.isIgnored():
                continue
            if not w.hasMarking("@NEG"):
                pos = pos + 1
            if w.hasMarking("@FMV") and pos > 2:
                self.skipMe()
                _emit("fmv taga ", self)

    def checkPrdPosition(self):
        """Skip the pipe unless its last non-ignored word carries ``@PRD``.

        A pipe without any non-ignored word is left untouched.
        """
        pos = -1
        for wi, w in enumerate(self):
            if w.isIgnored():
                continue
            # -2 flags "latest non-ignored word is not @PRD".
            pos = wi if w.hasMarking("@PRD") else -2
        if pos == -2 and not self.isSkipped():
            _emit("PRD not in end ", self)
            self.skipMe()

    def checkFcvFmv(self):
        """Skip the pipe when no word carries ``@FCV`` or ``@FMV``."""
        found = False
        for w in self:
            if w.hasMarking("@FCV") or w.hasMarking("@FMV"):
                found = True
        if not found and not self.isSkipped():
            _emit("FCV or FMV not found ", self)
            self.skipMe()


class Word(object):
    """One word together with its info strings and an "ignored" flag."""

    def __init__(self, w):
        self.word = w
        self.ignored = False
        self.inf = []

    def __str__(self):
        ignf = '1' if self.isIgnored() else '0'
        return " %s %-20s %s" % (ignf, self.word, self.inf)

    def addInfo(self, inf):
        """Append one info string."""
        self.inf.append(inf)

    def getInfo(self):
        """Return the list of info strings (the live list, not a copy)."""
        return self.inf

    def infoContains(self, s):
        """Return 1 when *s* equals one of the info strings, else 0."""
        return 1 if s in self.inf else 0

    def getWord(self):
        """Return the word text."""
        return self.word

    def isIgnored(self):
        """Return True when the word is flagged as ignored."""
        return self.ignored

    def setIgnored(self, ign):
        """Set the ignored flag; *ign* must be a ``bool``."""
        assert isinstance(ign, bool)
        self.ignored = ign

    def hasMarking(self, mark, skiplist=None):
        """Return True when *mark* occurs inside any info string.

        mark     -- substring searched for in every info string
        skiplist -- optional mapping mark -> list of marks; when *mark* is a
                    key and any mapped mark also occurs in an info string,
                    the result is forced to False
        """
        found = any(mark in info for info in self.inf)
        if found and skiplist and mark in skiplist:
            if any(sm in info
                   for sm in skiplist[mark]
                   for info in self.inf):
                found = False
        return found


class RulesIter(object):
    """Iterator yielding ``[rule_id, rule]`` pairs of a ``RulesList``."""

    def __init__(self, par):
        self.par = par
        self.c = -1

    def __iter__(self):
        return self

    def next(self):
        """Return the next ``[rule_id, rule]`` pair or raise StopIteration."""
        self.c = self.c + 1
        # ``>=`` keeps raising StopIteration on calls after exhaustion.
        if self.c >= len(self.par._rids):
            raise StopIteration
        rid = self.par._rids[self.c]
        return [rid, self.par._rules[rid]]

    # Python 3 spelling of the iterator protocol.
    __next__ = next


class RulesList(object):
    """Rules keyed by rule id, kept in input order, with match statistics."""

    def __init__(self, _rlist):
        """Build the list from *_rlist*, a sequence of ``{rule_id: rule}``
        dicts; the first key of each dict is used as its rule id."""
        self._statistics = {}
        self._statcoef = {}
        self._rawrules = _rlist
        self._rids = []
        self._rules = {}
        for entry in _rlist:
            rid = list(entry)[0]
            self._rids.append(rid)
            self._rules[rid] = entry[rid]

    def __iter__(self):
        return RulesIter(self)

    def printStats(self):
        """Print the match count, rule and coefficient buckets of each rule
        that matched, most frequent first, followed by the total."""
        _emit()
        ranked = sorted(self._statistics.items(),
                        key=lambda item: item[1], reverse=True)
        total = 0
        for rid, count in ranked:
            _emit(str(count).ljust(5), self._rules[rid], self._statcoef[rid])
            total += count
        _emit("Total rule instances matched", total)

    def get(self, rid):
        """Return the rule stored under *rid* (``KeyError`` if unknown)."""
        return self._rules[rid]

    def addStat(self, rid, coef=-1):
        """Count one match of rule *rid* and bucket its coefficient.

        Buckets, in order: ``== 1``, ``>= 0.8``, ``>= 0.6``, ``>= 0.4`` and
        everything else (including the default ``-1``).
        """
        self._statistics[rid] = self._statistics.get(rid, 0) + 1
        buckets = self._statcoef.setdefault(rid, [0, 0, 0, 0, 0])
        if coef == 1:
            slot = 0
        elif coef >= 0.8:
            slot = 1
        elif coef >= 0.6:
            slot = 2
        elif coef >= 0.4:
            slot = 3
        else:
            slot = 4
        buckets[slot] = buckets[slot] + 1


if __name__ == '__main__':
    # Small smoke test of Pipe iteration and membership.
    p = Pipe()
    _emit([x for x in p])
    p.append(1)
    p.append(2)
    l = getattr(p, 'append')
    _emit(l)
    _emit([x for x in p])
    for x in p:
        for y in p:
            _emit(x, y)
    _emit(1 in p)
    w = Word('asd')
    w in ['1', '2', '3']