#!/usr/bin/env python
# coding: utf-8

# In[1]:


# In[2]:

import os
import re
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

CWD = os.getcwd()
INPUT_FOLDER = os.path.join(CWD, 'input_texts')

# the "/*.txt" wildcard matters - we read in all the files
all_files = glob.glob(os.path.join(INPUT_FOLDER, "*.txt"))
nr_of_docs = len(all_files)
print("Texts read in >> " + str(nr_of_docs))

li = []
for filename in all_files:
    # os.path.basename is portable, unlike splitting on a hard-coded backslash
    doc_id = os.path.basename(filename)
    with open(filename, 'r', encoding='utf8') as f:
        text = f.read()
    li.append([doc_id, text])

df = pd.DataFrame(li, columns=['doc_id', 'text'])
df['text'] = df['text'].astype(str)


# In[3]:

df.head()


# In[4]:

stopwords_file = os.path.join(CWD, 'estonian-stopwords.txt')
with open(stopwords_file, 'r', encoding='utf8') as f:
    STOPWORDS = set(f.read().split())


# In[5]:

#STOPWORDS


# In[6]:

# we have now moved on from the input step to the next one - preprocessing
def text_prepare(text, stopwords):
    # replace punctuation and other noise characters with spaces,
    # collapse repeated spaces, lowercase, and drop stopwords
    replace_by_space_re = re.compile(r"""[\n"'/(){}\[\]|@,;#.!?]""")
    text = replace_by_space_re.sub(' ', text)
    text = re.sub(' +', ' ', text)
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in stopwords)
    return text.strip()


# In[7]:

# add a new empty column to our dataframe
df['clean_text'] = ''
df.head()


# In[22]:

df['clean_text'] = df['text'].apply(lambda x: text_prepare(x, STOPWORDS))
df.head()


# In[9]:

df.iloc[0].clean_text[0:400]


# In[10]:

def show_wordcloud(col):
    from wordcloud import WordCloud
    wordcloud = WordCloud().generate(' '.join(df[col]))
    plt.figure(figsize=(5, 4), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()


# In[11]:

show_wordcloud('text')


# In[12]:

show_wordcloud('clean_text')


# In[13]:

# the text preprocessing step is done, we can move on to vectorization
#
# max_df:
#   when building the vocabulary, ignore terms that have a document
#   frequency strictly higher than the given threshold
#   (corpus-specific stop words)
#
# min_df:
#   when building the vocabulary, ignore terms that have a
#   document frequency strictly lower than the given threshold
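# In[ ]:

# A quick illustrative sketch of those two thresholds on a toy corpus
# (the three toy documents and the threshold values here are made up for
# demonstration and are not part of this pipeline):
from sklearn.feature_extraction.text import TfidfVectorizer

toy_docs = [
    "red apple red",   # "red" occurs in all 3 documents
    "red pear apple",
    "red plum",
]
toy_vec = TfidfVectorizer(max_df=0.7, min_df=2)
toy_vec.fit(toy_docs)
# "red" is dropped (document frequency 3/3 > 0.7), "pear" and "plum" are
# dropped (each appears in only 1 document, below min_df=2), so only
# "apple" survives:
print(toy_vec.get_feature_names_out())  # ['apple']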
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_df=0.7, min_df=3)
tf_idf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])


# In[14]:

tf_idf_matrix


# In[15]:

tf_idf_matrix[0]


# In[16]:

#test_features = tfidf_vectorizer.get_feature_names_out()


# In[20]:

# top TF-IDF terms of one example document
# (get_feature_names() was removed in scikit-learn 1.2, use get_feature_names_out())
example_df = pd.DataFrame(tf_idf_matrix[3].T.todense(),
                          index=tfidf_vectorizer.get_feature_names_out(),
                          columns=['TF-IDF'])
example_df = example_df.sort_values('TF-IDF', ascending=False)
print(example_df.head(15))


# In[ ]:

# comparing the texts
# with a larger collection of texts it is better to compare them
# using cosine_similarity


# In[23]:

from sklearn.metrics.pairwise import cosine_similarity

# all documents against all documents
cosine_similarity_array = cosine_similarity(tf_idf_matrix)


# In[25]:

cosine_similarity_array[0:4]


# In[27]:

# DataFrame.append() was removed in pandas 2.0, so collect the rows
# in a list and build the DataFrame in one go
rows = []
for x in range(nr_of_docs):
    for y in range(nr_of_docs):
        if x != y:
            rows.append([cosine_similarity_array[x][y],
                         df.iloc[x].doc_id,
                         df.iloc[y].doc_id])
matches_df = pd.DataFrame(rows, columns=['similarity', 'text0_id', 'text1_id'])


# In[30]:

len(matches_df)


# In[33]:

matches_df.head()


# In[35]:

matches_df = matches_df.sort_values(by='similarity', ascending=False)


# In[39]:

matches_df.head(10)


# In[38]:

matches_df.hist()
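# In[ ]:

# A possible follow-up step (not in the original notebook; the 0.5 threshold
# is an arbitrary illustrative choice): keep only strongly similar pairs and
# drop the symmetric duplicates, since every pair appears twice in matches_df
# (once as A-B and once as B-A).
likely_matches = matches_df[matches_df['similarity'] > 0.5].copy()
pair_key = likely_matches.apply(
    lambda r: tuple(sorted([r['text0_id'], r['text1_id']])), axis=1)
likely_matches = likely_matches.loc[~pair_key.duplicated()]
print(likely_matches.head(10))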