#!/usr/bin/python2.4
# -*- coding: utf-8 -*-
"""
* statistics on how many things each rule found.
* dig the sentences tagged as word-order errors out of the corpus.
* build rules from the tagged sentences..
* clause length and rule length.
* ['@ADVL', '@FMV', '@SUBJ'] - searches and finds
"""

import sys
import re

from twisted.enterprise import adbapi
from twisted.internet import reactor

import objects
import rules
#from possible import rules as dyn_rules

dbpool = adbapi.ConnectionPool('psycopg2', user='evkk', database='evkktest')


def getSentences(txn, label):
    # Fetch every non-deleted sentence whose document carries the given tag
    # label.  The label goes in as a bound parameter, not string-concatenated.
    #txn.execute("select documentid, sentences.id, sentence, analysis from sentences left join tagged using (documentid) left join tags on (tags.id=tagged.tag) where label='analüüsida' and not deleted and sentences.id=78145 order by sentences.id")
    #txn.execute("select documentid, sentences.id, sentence, analysis from sentences left join tagged using (documentid) left join tags on (tags.id=tagged.tag) where label='analüüsida' and not deleted order by sentences.id")
    txn.execute("select documentid, sentences.id, sentence, analysis"
                " from sentences"
                " left join tagged using (documentid)"
                " left join tags on (tags.id=tagged.tag)"
                " where label=%s and not deleted order by sentences.id",
                (label,))
    return txn.fetchall()


def getSentencesOneDoc(txn, docid):
    # Fetch every non-deleted sentence of a single document.
    txn.execute("select documentid, sentences.id, sentence, analysis"
                " from sentences"
                " left join tagged using (documentid)"
                " left join tags on (tags.id=tagged.tag)"
                " where documentid=%s and not deleted order by sentences.id",
                (docid,))
    return txn.fetchall()


def getSentencesFromFile(txn, fname):
    # Read an analysed file and split it into "<s>" ... "</s>" chunks.
    # txn is unused; it only keeps the signature compatible with the
    # database fetchers above, since all three run via runInteraction.
    f = open(fname, 'rb')
    contents = f.read()
    f.close()
    res = []
    # each row mimics (documentid, sentences.id, sentence, analysis)
    for l in re.findall(r'("<s>.*?</s>")', contents, re.S):
        res.append(['-', '-', '-', l])
    return res


def _parse_analysis(txt):
    # Parse CG-style analysis text: a '"<word>"' line opens a new token,
    # tab-indented lines carry its readings.
    lines = txt.split('\n')
    words = objects.Pipe()
    mi = None
    for l in lines:
        l = l.decode('utf-8')
        if not l:
            mi = None
            continue
        if l == '"<s>"' or l == '"</s>"':
            mi = None
            continue
        if l.startswith('"'):
            mi = objects.Word(l[2:-2])
            #mi = {'word': l[2:-2], 'inf': [], 'ignore': False}
            words.append(mi)
        if l.startswith('\t') and mi is not None:
            #mi['inf'].append(l.replace('"'+mi['word']+'"', '').strip())
            mi.addInfo(l.strip())
    return words


def processSentences(res, generate):
    all = []
    print "sentences: "+str(len(res))
    #print "resources:", res
    for r in res:
        a = r[3]
        #print r[2]
        # skip analyses that do not contain exactly one sentence
        secount = a.count('<s>')
        if secount != 1:
            #raise Exception('oops', str(secount))
            continue
        words = _parse_analysis(a)
        words.documentid = r[0]
        words.checkFullstop(rules.full_stop)
        words.checkIgnoredWords(rules.allowed_ignores_words)
        words.checkIgnoredMarks(rules.allowed_ignores_marks, rules.needed, rules.skip_list)
        words.checkIgnoredWithMark(rules.conditional_ignore)
        #words.checkSLopus()
        words.checkWordsCorrect()
        words.checkContainsNumber()
        #if words.isSkipped():
        #    continue
        words.checkFmvPosition()
        words.checkPrdPosition()
        words.checkImvPosition()
        words.checkFcvFmv()
        #for w in words:
        #    print w
        #dyn_rules = objects.RulesList([{1: ['@SUBJ', '@FMV', '@ADVL']}])
        matches = []
        matchedrules = []
        maxrulelength = 0
        maxrule = "missing"
        maxruleid = 0
        for rid, rule in dyn_rules:
            if words.isSkipped():
                continue
            #print rule
            match = words.match(rule, rules)
            if match:
                matches.append(rid)
                matchedrules.append(rule)
                if len(rule) > maxrulelength:
                    maxrulelength = len(rule)
                    maxrule = rule
                    maxruleid = rid
        if matches:
            #print "longest: ", maxruleid, maxrule, maxrulelength
            rid = maxruleid
            words.addRuleMatch(rid)
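            # Of the rules that matched this sentence, only the longest one
            # is credited: its id was recorded on the sentence above, and its
            # statistics counter is updated below with the sentence
            # coefficient (words.coef).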
            dyn_rules.addStat(rid, words.coef)
            lisa = ""
            if words.isSkipped():
                lisa = " skipped " + str(words.skipcomments)
            # report the winning rule (maxrule), not the stale loop variable
            print lisa, "stat rule ", words.coef, rid, maxrule, words
            #print "rules: ", str(matches)
        all.append(words)
        print "#",
    print
    for a in all:
        if a.isSkipped():
            print "skipped:", a, str(a.skipcomments)
        else:
            print "ok: ", a, str(a._rulematches), str(a._ruleparts), str(a._partlength), str(a.skipcomments)
    print "Sentences:", len(all)

    #for a in all:
    #    print
    #    print a, "skipped: ", a.isSkipped()
    #    for rmi, rm in enumerate(a._rulematches):
    #        print " * "+str(dyn_rules.get(rm))
    #        if a._ruleparts[rmi] > 0:
    #            print "Partcount:", a._ruleparts[rmi]
    #            print "Coef:", a._partlength[rmi]
    #    if len(a._rulematches) == 0:
    #        for w in a:
    #            print w.getInfo()
    #            if w.isIgnored():
    #                print "ignored"
    #        if generate:
    #        #if 1:
    #            print "Generated rules:"
    #            try:
    #                all_pose = a.getPossibleRule(rules, debug=1)
    #                print "all ", str(all_pose)
    #                for pose in all_pose:
    #                    if len(pose) > 1:
    #                        print "found ", pose
    #                        print dyn_rules
    #                        if pose in dyn_rules:
    #                            print "already exists"
    #            except Exception, e:
    #                print "ERROR:", str(e)
    #    print " did:"+a.documentid

    print "matches for existing rules:"
    dyn_rules.printStats()

    if generate:
        print "\n"
        print "Possible rules"
        print
        new_count = {}
        pose_map = {}
        new_f = open('possible.py', 'w')
        # count candidate rules derived from sentences no existing rule matched
        for a in all:
            if not a.isSkipped():
                if len(a._rulematches) == 0:
                    #print "no rules matched: ", a
                    try:
                        all_pose = a.getPossibleRule(rules)
                        for pose in all_pose:
                            if len(pose) > 1:
                                posekey = '\t'.join(pose)
                                pose_map[posekey] = pose
                                if posekey in new_count:
                                    new_count[posekey] = new_count[posekey] + 1
                                else:
                                    new_count[posekey] = 1
                    except Exception, e:
                        print "ERROR:", str(e)
        news = new_count.items()
        news.sort(key=lambda x: x[1], reverse=True)
        #print "new ", str(news)
        print "new:"
        total = 0
        # The generated module must expose a "rules" attribute, because that
        # is what the __import__ loader at the bottom of this file looks up.
        print >> new_f, """import objects

rules = objects.RulesList(["""
        for i, n in enumerate(news):
            print n[1], n[0]
            total += n[1]
            # new rule ids continue after the existing hand-written ones;
            # the offset 307 is hard-coded to match the current rules module
            print >> new_f, "\t{"+str(i+307)+": "+str(pose_map[n[0]])+"}",
            if len(news) - 1 != i:
                print >> new_f, ","
            else:
                print >> new_f
        print >> new_f, "])"
        print "Total possible new:", total
        new_f.close()
    #f = open('possible.py', 'rb')
    #contents = f.read()
    #f.close()
    #print contents
    reactor.callLater(1, reactor.stop)


def _fail(err):
    # without an errback a failed query would leave the reactor running forever
    print "ERROR:", err
    reactor.stop()


def start(sourcemethod, argum, generate):
    x = dbpool.runInteraction(sourcemethod, argum)
    x.addCallback(processSentences, generate)
    x.addErrback(_fail)


sm = getSentences
argum = 'kontroll'
generate = False
rules_file = 'rules'
has_source = False
has_label = False
has_docid = False

if len(sys.argv) > 1:
    for x in sys.argv[1:]:
        if x.startswith('--source='):
            sm = getSentencesFromFile
            argum = x.split('=')[1]
            has_source = True
        if x.startswith('--documentid='):
            sm = getSentencesOneDoc
            argum = x.split('=')[1]
            has_docid = True
        if x.startswith('--label='):
            argum = x.split('=')[1]
            has_label = True
        if x.startswith('--generate'):
            generate = True
        if x.startswith('--rules='):
            rules_file = x.split('=')[1]
            if rules_file.endswith('.py'):
                rules_file = rules_file[:-3]

if has_source and has_label:
    print "cannot use --source and --label at once!"
    sys.exit()

#from rules import rules as dyn_rules
dyn_rules = __import__(rules_file, [], [], 'rules')
dyn_rules = dyn_rules.rules

start(sm, argum, generate)
reactor.run()
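# ---------------------------------------------------------------------------
# Usage sketch.  The script name "stat.py" and the file/document ids are
# placeholders; the flag names come from the argument parsing above, and
# 'kontroll' is the default tag label:
#
#   python2.4 stat.py --label=kontroll
#   python2.4 stat.py --documentid=1234
#   python2.4 stat.py --source=analysed.txt --generate
#   python2.4 stat.py --rules=possible
#
# A module loaded via --rules= is expected to expose a "rules" attribute;
# a minimal hand-written one, mirroring the commented-out example in
# processSentences, might look like:
#
#   import objects
#   rules = objects.RulesList([
#       {1: ['@SUBJ', '@FMV', '@ADVL']},
#   ])
# ---------------------------------------------------------------------------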