import os
import pandas as pd
import sklearn
import numpy as np
import re
import glob
import matplotlib.pyplot as plt

# Input documents live in an "input_texts" folder inside the current working
# directory. os.path.join keeps the path valid on every platform instead of
# hard-coding a Windows backslash (and an invalid "\i" string escape).
CWD = os.getcwd()
INPUT_FOLDER = os.path.join(CWD, 'input_texts')

# The "*.txt" wildcard is essential: it makes glob pick up every text file.
# Sorting makes the document order (and therefore row indices) reproducible.
all_files = sorted(glob.glob(os.path.join(INPUT_FOLDER, '*.txt')))
nr_of_docs = len(all_files)
print("Tekste sisse loetud >> " + str(nr_of_docs))

# One [doc_id, text] pair per input file; doc_id is the bare file name.
li = []

for filename in all_files:
    # basename works with both "/" and "\\" separators, unlike rsplit('\\').
    doc_id = os.path.basename(filename)
    with open(filename, 'r', encoding="utf8") as f:
        text = f.read()
    li.append([doc_id, text])

df = pd.DataFrame(li, columns=['doc_id', 'text'])
df['text'] = df['text'].astype(str)

df.head()

# Load the stop-word list from "estonian-stopwords.txt" in the working
# directory. The file is split on any whitespace, so words may be separated
# by newlines or spaces; the result is a set for O(1) membership tests.
# os.path.join avoids the Windows-only "\e..." path (an invalid string escape).
stopwords_file = os.path.join(CWD, 'estonian-stopwords.txt')
with open(stopwords_file, 'r', encoding='utf8') as f:
    STOPWORDS = set(f.read().split())

#STOPWORDS

# We have now moved on from the input-text step to the next step: preprocessing.
# re.compile builds a reusable regular-expression pattern object.
# TODO: review the character list passed to re.compile
# TODO: decide whether to also strip ! , . ? etc.

# Characters that are replaced by a single space before tokenising.
# Written as a raw string so the regex escapes are not parsed as (invalid)
# Python string escapes, and compiled once instead of on every call.
_REPLACE_BY_SPACE_RE = re.compile(r"[\n\"'/(){}\[\]|@,;#.]")


def text_prepare(text, STOPWORDS):
    """Normalise a raw document for vectorisation.

    The text is processed in this order:

    1. newline, quotes, slash, brackets, ``|``, ``@``, ``,``, ``;``, ``#``
       and ``.`` are each replaced by a space (note that ``!``, ``?`` and
       ``:`` are left untouched);
    2. the text is lower-cased;
    3. it is split on whitespace and every token found in ``STOPWORDS``
       is dropped;
    4. the remaining tokens are joined with single spaces.

    Args:
        text: The raw document text.
        STOPWORDS: Container of lower-case words to remove; membership is
            tested with ``in``.

    Returns:
        The cleaned text, with no leading/trailing or repeated spaces.
    """
    lowered = _REPLACE_BY_SPACE_RE.sub(' ', text).lower()
    # split()/join already collapses whitespace runs and trims both ends,
    # so no separate space-squeezing or strip() pass is needed.
    return ' '.join(word for word in lowered.split() if word not in STOPWORDS)

# Add a new, initially empty column to the dataframe.
df['clean_text'] = ''
df.head()

# Fill the column with the preprocessed version of every document; the
# stop-word set is forwarded to text_prepare as its second argument.
df['clean_text'] = df['text'].apply(text_prepare, args=(STOPWORDS,))
df.head()

# Peek at the first 400 characters of the first cleaned document.
df['clean_text'].iloc[0][:400]

def show_wordcloud(col):
    """Display a word cloud built from one text column of the global ``df``.

    All values of ``df[col]`` are joined with spaces into a single string,
    passed to a default-configured ``WordCloud``, and the result is shown
    in a 5x4 matplotlib figure with the axes hidden.

    Args:
        col: Name of the ``df`` column whose strings are visualised.
    """
    # Imported lazily so the optional dependency is only needed when plotting.
    from wordcloud import WordCloud

    combined_text = ' '.join(df[col])
    cloud_image = WordCloud().generate(combined_text)

    plt.figure(figsize=(5, 4), facecolor=None)
    plt.imshow(cloud_image)
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
# Draw the word cloud for the raw text first, then for the cleaned text,
# so the effect of preprocessing can be compared visually.
for column_name in ('text', 'clean_text'):
    show_wordcloud(column_name)

# Text preprocessing is now done; next we can move on to vectorisation.
#
#max_df
#When building the vocabulary ignore terms that have a document
#frequency strictly higher than the given threshold (corpus-specific stop words).
#
#min_df
#When building the vocabulary ignore terms that have a
#document frequency strictly lower than the given threshold.

from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF over the cleaned texts. Terms appearing in more than 70% of
# the documents (max_df) or in fewer than 3 documents (min_df) are ignored.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_df=0.7, min_df = 3)
tf_idf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])

tf_idf_matrix

tf_idf_matrix[0]

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name only on releases that predate it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# Show the highest-weighted terms of one example document. Row 3 is used when
# it exists; with fewer documents the last available row is used instead.
example_doc_idx = min(3, tf_idf_matrix.shape[0] - 1)
example_df = pd.DataFrame(
    # toarray().ravel() yields a plain 1-D ndarray rather than np.matrix.
    tf_idf_matrix[example_doc_idx].toarray().ravel(),
    index=feature_names,
    columns=["TF-IDF"],
)
example_df = example_df.sort_values('TF-IDF', ascending=False)
print (example_df.head(15))

# Comparing the texts: with a larger collection of texts it is better to
# compare them by cosine similarity.

from sklearn.metrics.pairwise import cosine_similarity

# Entry [i][j] is the similarity between document i and document j.
cosine_similarity_array = cosine_similarity(
    X=tf_idf_matrix[:nr_of_docs],
    Y=tf_idf_matrix,
)

cosine_similarity_array[:4]

# Build one row per ordered pair of distinct documents (x, y) holding their
# cosine similarity and both document ids. The rows are collected in a list
# and turned into a DataFrame once: DataFrame.append was removed in pandas 2.0
# and re-copied the whole frame on every iteration. Building it in one go
# also gives the "similarity" column a numeric dtype, which .hist() requires.
doc_ids = df['doc_id'].tolist()
match_rows = [
    (cosine_similarity_array[x][y], doc_ids[x], doc_ids[y])
    for x in range(nr_of_docs)
    for y in range(nr_of_docs)
    if x != y
]
matches_df = pd.DataFrame(match_rows, columns=['similarity', 'text0_id', 'text1_id'])

len(matches_df)

matches_df.head()

# Most similar document pairs first.
matches_df = matches_df.sort_values(by='similarity', ascending = False)

matches_df.head(10)

# Distribution of the pairwise similarity scores.
matches_df.hist()