#!/usr/bin/python2.4
# -*- coding: utf-8 -*-
"""
* statistics on how many things each rule found.
* dig the sentences tagged as word-order errors out of the corpus.
* build rules from the tagged sentences..
* clause length and rule length.
* ['@ADVL', '@FMV', '@SUBJ'] - searches and finds
"""

import sys
import re

from twisted.enterprise import adbapi
from twisted.internet import reactor

import objects
import rules
#from possible import rules as dyn_rules

dbpool = adbapi.ConnectionPool('psycopg2', user='evkk', database='evkktest')


def getSentences(txn, label):
    # Fetch every non-deleted sentence whose document carries the given tag
    # label.  The label goes in as a bound parameter, not string-concatenated.
    #txn.execute("select documentid, sentences.id, sentence, analysis from sentences left join tagged using (documentid) left join tags on (tags.id=tagged.tag) where label='analüüsida' and not deleted and sentences.id=78145 order by sentences.id")
    #txn.execute("select documentid, sentences.id, sentence, analysis from sentences left join tagged using (documentid) left join tags on (tags.id=tagged.tag) where label='analüüsida' and not deleted order by sentences.id")
    txn.execute("select documentid, sentences.id, sentence, analysis"
                " from sentences"
                " left join tagged using (documentid)"
                " left join tags on (tags.id=tagged.tag)"
                " where label=%s and not deleted order by sentences.id",
                (label,))
    return txn.fetchall()


def getSentencesOneDoc(txn, docid):
    # Fetch every non-deleted sentence of a single document.
    txn.execute("select documentid, sentences.id, sentence, analysis"
                " from sentences"
                " left join tagged using (documentid)"
                " left join tags on (tags.id=tagged.tag)"
                " where documentid=%s and not deleted order by sentences.id",
                (docid,))
    return txn.fetchall()


def getSentencesFromFile(txn, fname):
    # Read an analysed file and split it into "<s>" ... "</s>" chunks.
    # txn is unused; it only keeps the signature compatible with the
    # database fetchers above, since all three run via runInteraction.
    f = open(fname, 'rb')
    contents = f.read()
    f.close()
    res = []
    # each row mimics (documentid, sentences.id, sentence, analysis)
    for l in re.findall(r'("<s>.*?</s>")', contents, re.S):
        res.append(['-', '-', '-', l])
    return res


def _parse_analysis(txt):
    # Parse CG-style analysis text: a '"<word>"' line opens a new token,
    # tab-indented lines carry its readings.
    lines = txt.split('\n')
    words = objects.Pipe()
    mi = None
    for l in lines:
        l = l.decode('utf-8')
        if not l:
            mi = None
            continue
        if l == '"<s>"' or l == '"</s>"':
            mi = None
            continue
        if l.startswith('"'):
            mi = objects.Word(l[2:-2])
            #mi = {'word': l[2:-2], 'inf': [], 'ignore': False}
            words.append(mi)
        if l.startswith('\t') and mi is not None:
            #mi['inf'].append(l.replace('"'+mi['word']+'"', '').strip())
            mi.addInfo(l.strip())
    return words


def processSentences(res, generate):
    all = []
    print "sentences: "+str(len(res))
    #print "resources:", res
    for r in res:
        a = r[3]
        #print r[2]
        # skip analyses that do not contain exactly one sentence
        secount = a.count('<s>')
        if secount != 1:
            #raise Exception('oops', str(secount))
            continue
        words = _parse_analysis(a)
        words.documentid = r[0]
        words.checkFullstop(rules.full_stop)
        words.checkIgnoredWords(rules.allowed_ignores_words)
        words.checkIgnoredMarks(rules.allowed_ignores_marks, rules.needed, rules.skip_list)
        words.checkIgnoredWithMark(rules.conditional_ignore)
        #words.checkSLopus()
        words.checkWordsCorrect()
        words.checkContainsNumber()
        #if words.isSkipped():
        #    continue
        words.checkFmvPosition()
        words.checkPrdPosition()
        words.checkImvPosition()
        words.checkFcvFmv()
        #for w in words:
        #    print w
        #dyn_rules = objects.RulesList([{1: ['@SUBJ', '@FMV', '@ADVL']}])
        matches = []
        matchedrules = []
        maxrulelength = 0
        maxrule = "missing"
        maxruleid = 0
        for rid, rule in dyn_rules:
            if words.isSkipped():
                continue
            #print rule
            match = words.match(rule, rules)
            if match:
                matches.append(rid)
                matchedrules.append(rule)
                if len(rule) > maxrulelength:
                    maxrulelength = len(rule)
                    maxrule = rule
                    maxruleid = rid
        if matches:
            #print "longest: ", maxruleid, maxrule, maxrulelength
            rid = maxruleid
            words.addRuleMatch(rid)
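            # Of the rules that matched this sentence, only the longest one
            # is credited: its id was recorded on the sentence above, and its
            # statistics counter is updated below with the sentence
            # coefficient (words.coef).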
            dyn_rules.addStat(rid, words.coef)
            lisa = ""
            if words.isSkipped():
                lisa = " skipped " + str(words.skipcomments)
            # report the winning rule (maxrule), not the stale loop variable
            print lisa, "stat rule ", words.coef, rid, maxrule, words
            #print "rules: ", str(matches)
        all.append(words)
        print "#",
    print
    for a in all:
        if a.isSkipped():
            print "skipped:", a, str(a.skipcomments)
        else:
            print "ok: ", a, str(a._rulematches), str(a._ruleparts), str(a._partlength), str(a.skipcomments)
    print "Sentences:", len(all)

    #for a in all:
    #    print
    #    print a, "skipped: ", a.isSkipped()
    #    for rmi, rm in enumerate(a._rulematches):
    #        print " * "+str(dyn_rules.get(rm))
    #        if a._ruleparts[rmi] > 0:
    #            print "Partcount:", a._ruleparts[rmi]
    #            print "Coef:", a._partlength[rmi]
    #    if len(a._rulematches) == 0:
    #        for w in a:
    #            print w.getInfo()
    #            if w.isIgnored():
    #                print "ignored"
    #        if generate:
    #        #if 1:
    #            print "Generated rules:"
    #            try:
    #                all_pose = a.getPossibleRule(rules, debug=1)
    #                print "all ", str(all_pose)
    #                for pose in all_pose:
    #                    if len(pose) > 1:
    #                        print "found ", pose
    #                        print dyn_rules
    #                        if pose in dyn_rules:
    #                            print "already exists"
    #            except Exception, e:
    #                print "ERROR:", str(e)
    #    print " did:"+a.documentid

    print "matches for existing rules:"
    dyn_rules.printStats()

    if generate:
        print "\n"
        print "Possible rules"
        print
        new_count = {}
        pose_map = {}
        new_f = open('possible.py', 'w')
        # count candidate rules derived from sentences no existing rule matched
        for a in all:
            if not a.isSkipped():
                if len(a._rulematches) == 0:
                    #print "no rules matched: ", a
                    try:
                        all_pose = a.getPossibleRule(rules)
                        for pose in all_pose:
                            if len(pose) > 1:
                                posekey = '\t'.join(pose)
                                pose_map[posekey] = pose
                                if posekey in new_count:
                                    new_count[posekey] = new_count[posekey] + 1
                                else:
                                    new_count[posekey] = 1
                    except Exception, e:
                        print "ERROR:", str(e)
        news = new_count.items()
        news.sort(key=lambda x: x[1], reverse=True)
        #print "new ", str(news)
        print "new:"
        total = 0
        # The generated module must expose a "rules" attribute, because that
        # is what the __import__ loader at the bottom of this file looks up.
        print >> new_f, """import objects

rules = objects.RulesList(["""
        for i, n in enumerate(news):
            print n[1], n[0]
            total += n[1]
            # new rule ids continue after the existing hand-written ones;
            # the offset 307 is hard-coded to match the current rules module
            print >> new_f, "\t{"+str(i+307)+": "+str(pose_map[n[0]])+"}",
            if len(news) - 1 != i:
                print >> new_f, ","
            else:
                print >> new_f
        print >> new_f, "])"
        print "Total possible new:", total
        new_f.close()
    #f = open('possible.py', 'rb')
    #contents = f.read()
    #f.close()
    #print contents
    reactor.callLater(1, reactor.stop)


def _fail(err):
    # without an errback a failed query would leave the reactor running forever
    print "ERROR:", err
    reactor.stop()


def start(sourcemethod, argum, generate):
    x = dbpool.runInteraction(sourcemethod, argum)
    x.addCallback(processSentences, generate)
    x.addErrback(_fail)


sm = getSentences
argum = 'kontroll'
generate = False
rules_file = 'rules'
has_source = False
has_label = False
has_docid = False

if len(sys.argv) > 1:
    for x in sys.argv[1:]:
        if x.startswith('--source='):
            sm = getSentencesFromFile
            argum = x.split('=')[1]
            has_source = True
        if x.startswith('--documentid='):
            sm = getSentencesOneDoc
            argum = x.split('=')[1]
            has_docid = True
        if x.startswith('--label='):
            argum = x.split('=')[1]
            has_label = True
        if x.startswith('--generate'):
            generate = True
        if x.startswith('--rules='):
            rules_file = x.split('=')[1]
            if rules_file.endswith('.py'):
                rules_file = rules_file[:-3]

if has_source and has_label:
    print "cannot use --source and --label at once!"
    sys.exit()

#from rules import rules as dyn_rules
dyn_rules = __import__(rules_file, [], [], 'rules')
dyn_rules = dyn_rules.rules

start(sm, argum, generate)
reactor.run()
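# ---------------------------------------------------------------------------
# Usage sketch.  The script name "stat.py" and the file/document ids are
# placeholders; the flag names come from the argument parsing above, and
# 'kontroll' is the default tag label:
#
#   python2.4 stat.py --label=kontroll
#   python2.4 stat.py --documentid=1234
#   python2.4 stat.py --source=analysed.txt --generate
#   python2.4 stat.py --rules=possible
#
# A module loaded via --rules= is expected to expose a "rules" attribute;
# a minimal hand-written one, mirroring the commented-out example in
# processSentences, might look like:
#
#   import objects
#   rules = objects.RulesList([
#       {1: ['@SUBJ', '@FMV', '@ADVL']},
#   ])
# ---------------------------------------------------------------------------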