{ "cells": [ { "cell_type": "code", "execution_count": 10, "id": "b389c49b-1459-4f30-ba11-a5a1f1a26ed3", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import sklearn\n", "import numpy as np\n", "import re\n", "import glob\n", "\n", "import matplotlib.pyplot as plt\n" ] }, { "cell_type": "code", "execution_count": 15, "id": "9122a22c-66d6-4f05-ab86-4e21f2498ffe", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "23\n" ] } ], "source": [ "CWD=os.getcwd()\n", "INPUT_FOLDER=CWD+\"\\\\input_texts\"\n", "all_files=glob.glob(INPUT_FOLDER+\"/*.txt\")\n", "NR_OF_DOCS=len(all_files)\n", "print(NR_OF_DOCS)" ] }, { "cell_type": "code", "execution_count": 20, "id": "dd42ec7d-aa36-468a-96d8-1fe99b830ebc", "metadata": {}, "outputs": [], "source": [ "li=[]\n", "for filename in all_files:\n", " item=[]\n", " doc_id=filename.rsplit('\\\\', 1)[1]\n", " item.append(doc_id)\n", " with open(filename, encoding=\"utf8\") as f:\n", " text=f.read()\n", " item.append(text);\n", " li.append(item)\n", " " ] }, { "cell_type": "code", "execution_count": 25, "id": "30b4c9ca-3480-4ff7-ba34-726acd2b0b8b", "metadata": {}, "outputs": [], "source": [ "df=pd.DataFrame(li, columns=['doc_id', 'text'])\n", "df['text']=df['text'].astype(str)\n", "stopwords_file=CWD+\"\\\\estonian-stopwords.txt\"\n", "with open(stopwords_file, encoding='utf8') as f:\n", " STOPWORDS=set(f.read().replace(\"\\n\", \" \").split())\n" ] }, { "cell_type": "code", "execution_count": 29, "id": "3f8058b3-e010-4fdf-acfa-203175a1a593", "metadata": {}, "outputs": [], "source": [ "def text_prepare(text, STOPWORDS):\n", " #replace_by_space_re = re.compile('\\ |\\?|\\.|\\!|\\/|\\;|\\:|\\n|\\r')\n", " replace_by_space_re = re.compile('[\\n\\\"\\'/(){}\\[\\]\\|@,;#.]')\n", " text = re.sub(replace_by_space_re, ' ', text)\n", " text = re.sub(' +', ' ', text)\n", " text = text.lower()\n", " \n", " text = ' '.join([word for word in text.split() if word not in STOPWORDS])\n", " \n", " #to_replace_list = ['\\\\n', '"', '\\\\xa0'] \n", " #for element in to_replace_list:\n", " # text.replace(element, ' ') \n", " \n", " text = text.strip()\n", " return text" ] }, { "cell_type": "code", "execution_count": 30, "id": "bd8421b0-0a31-41a2-aed3-98bdaf0640d1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | doc_id | \n", "text | \n", "clean_text | \n", "
---|---|---|---|
0 | \n", "doc_104580264061_item.txt | \n", "Eesti taasiseseisvumise sümbol, Eesti vabaduse... | \n", "eesti taasiseseisvumise sümbol eesti vabaduse ... | \n", "
1 | \n", "doc_384045275280_item.txt | \n", "Retsensioon Eesnimi Perekonnanimi magistritö... | \n", "retsensioon eesnimi perekonnanimi magistritööl... | \n", "
2 | \n", "doc_485839256085_item.txt | \n", "Annotatsioon esilehe esiküljeleVene vanaususul... | \n", "annotatsioon esilehe esiküljelevene vanaususul... | \n", "
3 | \n", "doc_491521501740_item.txt | \n", "Tallinna Pedagoogika Ülikool Filoloogia teadu... | \n", "tallinna pedagoogika ülikool filoloogia teadus... | \n", "
4 | \n", "doc_491521501743_item.txt | \n", "Tallinna pedagoogikaülikool Eesti keele kui v... | \n", "tallinna pedagoogikaülikool eesti keele võõrke... | \n", "