#!/usr/bin/env python
# coding: utf-8

# In[1]:


# In[2]:

import os
import re
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

CWD = os.getcwd()
INPUT_FOLDER = os.path.join(CWD, 'input_texts')

# the "/*.txt" wildcard matters - we read in all the files
all_files = glob.glob(os.path.join(INPUT_FOLDER, "*.txt"))
nr_of_docs = len(all_files)
print("Texts read in >> " + str(nr_of_docs))

li = []
for filename in all_files:
    # os.path.basename is portable, unlike splitting on a hard-coded backslash
    doc_id = os.path.basename(filename)
    with open(filename, 'r', encoding='utf8') as f:
        text = f.read()
    li.append([doc_id, text])

df = pd.DataFrame(li, columns=['doc_id', 'text'])
df['text'] = df['text'].astype(str)


# In[3]:

df.head()


# In[4]:

stopwords_file = os.path.join(CWD, 'estonian-stopwords.txt')
with open(stopwords_file, 'r', encoding='utf8') as f:
    STOPWORDS = set(f.read().split())


# In[5]:

#STOPWORDS


# In[6]:

# we have now moved on from the input step to the next one - preprocessing
def text_prepare(text, stopwords):
    # replace punctuation and other noise characters with spaces,
    # collapse repeated spaces, lowercase, and drop stopwords
    replace_by_space_re = re.compile(r"""[\n"'/(){}\[\]|@,;#.!?]""")
    text = replace_by_space_re.sub(' ', text)
    text = re.sub(' +', ' ', text)
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in stopwords)
    return text.strip()


# In[7]:

# add a new empty column to our dataframe
df['clean_text'] = ''
df.head()


# In[22]:

df['clean_text'] = df['text'].apply(lambda x: text_prepare(x, STOPWORDS))
df.head()


# In[9]:

df.iloc[0].clean_text[0:400]


# In[10]:

def show_wordcloud(col):
    from wordcloud import WordCloud
    wordcloud = WordCloud().generate(' '.join(df[col]))
    plt.figure(figsize=(5, 4), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()


# In[11]:

show_wordcloud('text')


# In[12]:

show_wordcloud('clean_text')


# In[13]:

# the text preprocessing step is done, we can move on to vectorization
#
# max_df:
#   when building the vocabulary, ignore terms that have a document
#   frequency strictly higher than the given threshold
#   (corpus-specific stop words)
#
# min_df:
#   when building the vocabulary, ignore terms that have a
#   document frequency strictly lower than the given threshold
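# In[ ]:

# A quick illustrative sketch of those two thresholds on a toy corpus
# (the three toy documents and the threshold values here are made up for
# demonstration and are not part of this pipeline):
from sklearn.feature_extraction.text import TfidfVectorizer

toy_docs = [
    "red apple red",   # "red" occurs in all 3 documents
    "red pear apple",
    "red plum",
]
toy_vec = TfidfVectorizer(max_df=0.7, min_df=2)
toy_vec.fit(toy_docs)
# "red" is dropped (document frequency 3/3 > 0.7), "pear" and "plum" are
# dropped (each appears in only 1 document, below min_df=2), so only
# "apple" survives:
print(toy_vec.get_feature_names_out())  # ['apple']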
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_df=0.7, min_df=3)
tf_idf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])


# In[14]:

tf_idf_matrix


# In[15]:

tf_idf_matrix[0]


# In[16]:

#test_features = tfidf_vectorizer.get_feature_names_out()


# In[20]:

# top TF-IDF terms of one example document
# (get_feature_names() was removed in scikit-learn 1.2, use get_feature_names_out())
example_df = pd.DataFrame(tf_idf_matrix[3].T.todense(),
                          index=tfidf_vectorizer.get_feature_names_out(),
                          columns=['TF-IDF'])
example_df = example_df.sort_values('TF-IDF', ascending=False)
print(example_df.head(15))


# In[ ]:

# comparing the texts
# with a larger collection of texts it is better to compare them
# using cosine_similarity


# In[23]:

from sklearn.metrics.pairwise import cosine_similarity

# all documents against all documents
cosine_similarity_array = cosine_similarity(tf_idf_matrix)


# In[25]:

cosine_similarity_array[0:4]


# In[27]:

# DataFrame.append() was removed in pandas 2.0, so collect the rows
# in a list and build the DataFrame in one go
rows = []
for x in range(nr_of_docs):
    for y in range(nr_of_docs):
        if x != y:
            rows.append([cosine_similarity_array[x][y],
                         df.iloc[x].doc_id,
                         df.iloc[y].doc_id])
matches_df = pd.DataFrame(rows, columns=['similarity', 'text0_id', 'text1_id'])


# In[30]:

len(matches_df)


# In[33]:

matches_df.head()


# In[35]:

matches_df = matches_df.sort_values(by='similarity', ascending=False)


# In[39]:

matches_df.head(10)


# In[38]:

matches_df.hist()
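# In[ ]:

# A possible follow-up step (not in the original notebook; the 0.5 threshold
# is an arbitrary illustrative choice): keep only strongly similar pairs and
# drop the symmetric duplicates, since every pair appears twice in matches_df
# (once as A-B and once as B-A).
likely_matches = matches_df[matches_df['similarity'] > 0.5].copy()
pair_key = likely_matches.apply(
    lambda r: tuple(sorted([r['text0_id'], r['text1_id']])), axis=1)
likely_matches = likely_matches.loc[~pair_key.duplicated()]
print(likely_matches.head(10))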