#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Rank received mail by the estimated probability that the user will reply to it,
using a multinomial naive Bayes model over body text and header-derived features."""

USELANG = False
LINEW = 80  # output line width

import sys
import email
import re
import mailbox
import glob
import os.path

import html2text
html2text.UNICODE_SNOB = 1  # No reason to replace unicode characters with ascii lookalikes here
import GeoIP
if USELANG:
    import guess_language

import regexplib
from name2gender import name2gender

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

try:
    import cPickle as pickle
except ImportError:
    import pickle

geoip = GeoIP.new(GeoIP.GEOIP_MEMORY_CACHE)

words = re.compile(ur'[\wöäüõšž]+', re.UNICODE | re.IGNORECASE)
SEP = u'___'


def getmessagetext_plain(message):
    """Returns all plaintext content in a message."""
    if message.get_content_type() == 'text/plain':
        encoding = message.get_content_charset()
        text = message.get_payload(decode=True)
        if encoding:
            text = text.decode(encoding, 'ignore')
        else:
            # Let's just try to decode it; the chances are this will work,
            # and even a text without unicode characters is better
            # than no text at all.
            text = text.decode('unicode-escape', 'ignore')
        return text + '\n'
    elif message.is_multipart():
        # Parts are messages too, so they can consist of parts again. They do.
        return '\n'.join(getmessagetext_plain(part)
                         for part in message.get_payload()).strip('\n')
    else:
        return ''


def getmessagetext_html(message):
    """Returns all HTML content in a message, converted to text."""
    if message.get_content_type() == 'text/html':
        encoding = message.get_content_charset()
        text = message.get_payload(decode=True)
        if encoding:
            text = text.decode(encoding, 'ignore')
        else:
            text = text.decode('unicode-escape', 'ignore')
        try:
            return html2text.html2text(text) + '\n'
        except Exception:
            # Some html is just invalid...
            return ''
    elif message.is_multipart():
        return '\n'.join(getmessagetext_html(part)
                         for part in message.get_payload()).strip('\n')
    else:
        return ''


def getmessagetext(message):
    """Extracts text content from an email.

    Parses HTML using html2text if no plaintext content is found."""
    if hasattr(message, 'fp'):  # workaround for Maildirmessage objects
        message.fp.seek(0)
        message = email.message_from_file(message.fp)
    text = getmessagetext_plain(message)
    if text:
        return text
    return getmessagetext_html(message)


def getcontenttypes(message):
    """Returns the content types of the message and of all its parts."""
    if hasattr(message, 'fp'):  # workaround for Maildirmessage objects
        message.fp.seek(0)
        message = email.message_from_file(message.fp)
    if message.is_multipart():
        return [message.get_content_type()] + sum(
            (getcontenttypes(part) for part in message.get_payload()), [])
    else:
        return [message.get_content_type()]


def orig(text):
    """Drops quoted (">") lines, keeping only the original text."""
    return '\n'.join(line for line in text.splitlines()
                     if not line.lstrip().startswith('>'))


def getheaders(message, header):
    """Returns all decoded instances of the given header as unicode strings."""
    ret = []
    got = message.get_all(header)
    if got:
        for instance in got:
            for text, encoding in email.Header.decode_header(instance):
                if encoding:
                    text = text.decode(encoding)
                else:
                    text = text.decode('unicode-escape')
                ret.append(text)
    return ret


def getsendergender(fromheader):
    """Guesses the sender's gender from the display name in the From header."""
    L = fromheader.replace('"', '').split()
    L = filter(lambda s: all(c.isalpha() for c in s), L)
    if len(L) > 2:
        L = filter(lambda s: all(all(c == c.lower() for c in ss[1:])
                                 for ss in s.split('-')), L)
    if len(L) > 1:
        L = filter(lambda s: not s.endswith(','), L)
    if len(L) > 2:
        L = filter(lambda s: s[0] == s[0].upper(), L)
    if len(L) > 2:
        L = filter(name2gender, L)
    for word in L:
        gender = name2gender(word)
        if gender:
            return gender


def getsenderip(receivedheader):
    # The last address in the header is nearest to the sender
    for candidate in reversed(regexplib.ipv4find.findall(receivedheader)):
        if regexplib.ipv4validate.match(candidate):
            return candidate


def getsenderlocation(receivedheader):
    ip = getsenderip(receivedheader)
    if not ip:
        return {}
    ret = dict(country=geoip.country_name_by_addr(ip))
    return ret


def messageinfo(message):
    """Builds the feature text for a message: body text plus synthetic header tokens."""
    ret = getmessagetext(message) + '\n\n'
    if USELANG:
        language = 'language' + SEP + guess_language.guessLanguageName(ret)
    for (mark, placeholder) in [(',', 'comma'), ('.', 'full_stop'),
                                ('!', 'exclamationmark'), ('?', 'questionmark')]:
        ret = ret.replace(mark, mark + ' ' + SEP + placeholder + ' ')
    if USELANG:
        ret += language + '\n'
    for header in ['subject']:  # Headers that are also content
        ret = ret.rstrip() + '\n'
        for instance in getheaders(message, header):
            ret += instance + ' '
            for word in words.findall(instance):
                ret += header + SEP + word + ' '
                ret += word + ' '
    ret = ret.rstrip() + '\n'

    headerinfo = set()
    for header in message.keys():
        headerinfo.add('hasheader' + SEP + header.replace('.', '_').replace('-', '_'))
    for header in ['sender', 'to', 'cc', 'x-mailer', 'from',
                   'importance', 'precedence', 'List-Id']:
        for instance in getheaders(message, header):
            instance += ' ' + instance.replace('@', '_').replace('.', '__')
            if header.startswith('x-'):
                header = header[2:]
            for word in words.findall(instance):
                if sum(c.isalpha() for c in word) > (len(word) / 3 * 2):
                    headerinfo.add(header + SEP + word)
    receivedheaders = '\n'.join(getheaders(message, 'received'))
    if getsenderip(receivedheaders):
        headerinfo.add('from_ip' + SEP + getsenderip(receivedheaders).replace('.', '_'))
    for k, v in getsenderlocation(receivedheaders).iteritems():
        if v:
            headerinfo.add(u'from_location_' + k + SEP + v.decode('utf-8').replace(' ', '_'))
    gender = getsendergender('\n'.join(getheaders(message, 'from')))
    headerinfo.add('from_gender' + SEP + str(gender))
    for contenttype in getcontenttypes(message):
        headerinfo.add('contains' + SEP + contenttype.replace('/', '_'))
    return ret + '\n' + ' '.join(headerinfo)
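# For illustration only (hypothetical values): for a plain-text message with
# subject "Meeting tomorrow", messageinfo() yields the body text followed by
# synthetic tokens along the lines of
#
#   subject___Meeting subject___tomorrow hasheader___subject hasheader___from
#   from_ip___192_0_2_1 from_location_country___Estonia from_gender___None
#   contains___text_plain
#
# The classifier below treats these tokens as ordinary words, so header
# evidence and body text are learned in the same bag-of-words model.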
def none(*args):
    return args


def removeshortwords(minlength):
    def f(messagetexts):
        return [' '.join(w for w in message.split() if len(w) > minlength)
                for message in messagetexts]
    return f


def tf(train, test):
    trf = TfidfTransformer(use_idf=False)
    trf = trf.fit(train)
    train = trf.transform(train)
    test = trf.transform(test)
    return train, test


def tfidf(train, test):
    trf = TfidfTransformer()
    trf = trf.fit(train)
    train = trf.transform(train)
    test = trf.transform(test)
    return train, test


def messages_from_maildir(maildir):
    for message in maildir:
        message.fp.seek(0)
        yield email.message_from_file(message.fp)


def messages_from_path(path):
    paths = glob.glob(path)
    if len(paths) == 1 and os.path.isdir(paths[0]):
        paths = glob.glob(os.path.join(path, '*'))
    for filepath in paths:
        filepath = os.path.expanduser(filepath)
        try:
            with open(filepath) as f:
                yield email.message_from_file(f)
        except Exception, E:
            print E


def sentby(userid):
    def sentbyuser(message):
        for sender in getheaders(message, 'from'):
            if userid in sender:
                return True
    return sentbyuser


def extractfeatures(messages, userid):
    """Returns (featuretexts, isreplied) for the messages not sent by userid.

    A received message counts as replied to if any message sent by userid
    references its Message-ID."""
    featuretexts = []
    repliedmessageids = set()
    for message in messages:
        if sentby(userid)(message):
            repliedmessageids.update(getheaders(message, 'in-reply-to'))
            repliedmessageids.update(getheaders(message, 'references'))
        else:
            featuretexts.append(messageinfo(message))

    def replied_to(message):
        for message_id in getheaders(message, 'message-id'):
            if message_id in repliedmessageids:
                return True
        return False

    isreplied = []
    for message in messages:
        if not sentby(userid)(message):
            isreplied.append(replied_to(message))
    return featuretexts, isreplied


def trainImportanceModel(featuretexts, targetvalues):
    vect = CountVectorizer()
    trf = TfidfTransformer(use_idf=False)
    clf = MultinomialNB(alpha=0.001)
    featurevectors = vect.fit_transform(featuretexts)
    tfvectors = trf.fit_transform(featurevectors)
    clf.fit(tfvectors, targetvalues)
    return (vect, trf, clf)


def predictImportance((vect, trf, clf), featuretexts):
    featurevectors = vect.transform(featuretexts)
    tfvectors = trf.transform(featurevectors)
    predictvector = clf.predict_proba(tfvectors)
    predictlist = predictvector.tolist()
    # Keep only the "replied" column; the "not replied" probability is redundant.
    repliedprobas = [p for (_, p) in predictlist]
    return repliedprobas
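# A minimal sketch of programmatic use, with toy data (purely illustrative,
# not part of the command-line interface below):
#
#   texts = ['free offer click now', 'meeting notes attached thanks']
#   model = trainImportanceModel(texts, [False, True])
#   predictImportance(model, ['notes from the meeting'])  # -> [p] with 0 <= p <= 1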
USAGE = """usage: olulisedkirjad.py /path/to/messages /path/to/model userid [train]"""


def main():
    args = sys.argv[1:]
    if len(args) == 3:
        args.append('predict')
    if len(args) != 4:
        print USAGE
        return 1
    maildirpath, modelpath, userid, action = args

    try:
        maildir = mailbox.Maildir(maildirpath)
        print 'Reading maildir'
        allmessages = list(messages_from_maildir(maildir))
        print 'Maildir read.'
    except OSError:
        print 'This is not a maildir, so we will hope to just find the messages there'
        allmessages = list(messages_from_path(maildirpath))

    if action == 'train':
        print 'training'
        received_messages_features, replied = extractfeatures(allmessages, userid)
        T = trainImportanceModel(received_messages_features, replied)
        with open(modelpath, 'wb') as modelfile:
            pickle.dump(T, modelfile)
    elif action == 'predict':
        print 'predicting'
        with open(modelpath, 'rb') as modelfile:
            T = pickle.load(modelfile)
        receivedmessages = filter(lambda m: not sentby(userid)(m), allmessages)
        received_messages_features, alreadyreplied = extractfeatures(allmessages, userid)
        importanceestimate = predictImportance(T, received_messages_features)
        importantmessages = zip(importanceestimate, receivedmessages)
        importantmessages.sort()
        importantmessages.reverse()
        for (importance, message) in importantmessages:
            headline = u''
            # headline += ' '.join(getheaders(message,'message-id'))
            # headline += " | "
            fromheader = ' '.join(getheaders(message, 'from'))
            headline += ' '.join(re.findall('<.*>', fromheader))[1:-1].strip()
            headline += " | "
            headline += ' '.join(getheaders(message, 'subject'))
            headline = headline.replace('\n', ' ')
            headline = headline[:LINEW - 11]  # leave room for the importance column
            headline += " | "
            headline += ' ' * (LINEW - 8 - len(headline))
            headline += "%0.3g" % importance
            print headline


if __name__ == '__main__':
    main()
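# Example invocation (hypothetical paths and userid, shown for illustration only):
#
#   python olulisedkirjad.py ~/Maildir model.pkl me@example.com train
#   python olulisedkirjad.py ~/Maildir model.pkl me@example.com
#
# The first call fits the model on the user's own reply behaviour and pickles it
# to model.pkl; the second (action defaults to 'predict') prints the received
# messages sorted by the estimated probability of getting a reply.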