{ "cells": [ { "cell_type": "code", "execution_count": 10, "id": "b389c49b-1459-4f30-ba11-a5a1f1a26ed3", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import sklearn\n", "import numpy as np\n", "import re\n", "import glob\n", "\n", "import matplotlib.pyplot as plt\n" ] }, { "cell_type": "code", "execution_count": 15, "id": "9122a22c-66d6-4f05-ab86-4e21f2498ffe", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "23\n" ] } ], "source": [ "CWD=os.getcwd()\n", "INPUT_FOLDER=CWD+\"\\\\input_texts\"\n", "all_files=glob.glob(INPUT_FOLDER+\"/*.txt\")\n", "NR_OF_DOCS=len(all_files)\n", "print(NR_OF_DOCS)" ] }, { "cell_type": "code", "execution_count": 20, "id": "dd42ec7d-aa36-468a-96d8-1fe99b830ebc", "metadata": {}, "outputs": [], "source": [ "li=[]\n", "for filename in all_files:\n", " item=[]\n", " doc_id=filename.rsplit('\\\\', 1)[1]\n", " item.append(doc_id)\n", " with open(filename, encoding=\"utf8\") as f:\n", " text=f.read()\n", " item.append(text);\n", " li.append(item)\n", " " ] }, { "cell_type": "code", "execution_count": 25, "id": "30b4c9ca-3480-4ff7-ba34-726acd2b0b8b", "metadata": {}, "outputs": [], "source": [ "df=pd.DataFrame(li, columns=['doc_id', 'text'])\n", "df['text']=df['text'].astype(str)\n", "stopwords_file=CWD+\"\\\\estonian-stopwords.txt\"\n", "with open(stopwords_file, encoding='utf8') as f:\n", " STOPWORDS=set(f.read().replace(\"\\n\", \" \").split())\n" ] }, { "cell_type": "code", "execution_count": 29, "id": "3f8058b3-e010-4fdf-acfa-203175a1a593", "metadata": {}, "outputs": [], "source": [ "def text_prepare(text, STOPWORDS):\n", " #replace_by_space_re = re.compile('\\ |\\?|\\.|\\!|\\/|\\;|\\:|\\n|\\r')\n", " replace_by_space_re = re.compile('[\\n\\\"\\'/(){}\\[\\]\\|@,;#.]')\n", " text = re.sub(replace_by_space_re, ' ', text)\n", " text = re.sub(' +', ' ', text)\n", " text = text.lower()\n", " \n", " text = ' '.join([word for word in text.split() if word not in STOPWORDS])\n", " \n", " #to_replace_list = ['\\\\n', '"', '\\\\xa0'] \n", " #for element in to_replace_list:\n", " # text.replace(element, ' ') \n", " \n", " text = text.strip()\n", " return text" ] }, { "cell_type": "code", "execution_count": 30, "id": "bd8421b0-0a31-41a2-aed3-98bdaf0640d1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
doc_idtextclean_text
0doc_104580264061_item.txtEesti taasiseseisvumise sümbol, Eesti vabaduse...eesti taasiseseisvumise sümbol eesti vabaduse ...
1doc_384045275280_item.txtRetsensioon Eesnimi Perekonnanimi magistritö...retsensioon eesnimi perekonnanimi magistritööl...
2doc_485839256085_item.txtAnnotatsioon esilehe esiküljeleVene vanaususul...annotatsioon esilehe esiküljelevene vanaususul...
3doc_491521501740_item.txtTallinna Pedagoogika Ülikool Filoloogia teadu...tallinna pedagoogika ülikool filoloogia teadus...
4doc_491521501743_item.txtTallinna pedagoogikaülikool Eesti keele kui v...tallinna pedagoogikaülikool eesti keele võõrke...
\n", "
" ], "text/plain": [ " doc_id \\\n", "0 doc_104580264061_item.txt \n", "1 doc_384045275280_item.txt \n", "2 doc_485839256085_item.txt \n", "3 doc_491521501740_item.txt \n", "4 doc_491521501743_item.txt \n", "\n", " text \\\n", "0 Eesti taasiseseisvumise sümbol, Eesti vabaduse... \n", "1 Retsensioon Eesnimi Perekonnanimi magistritö... \n", "2 Annotatsioon esilehe esiküljeleVene vanaususul... \n", "3 Tallinna Pedagoogika Ülikool Filoloogia teadu... \n", "4 Tallinna pedagoogikaülikool Eesti keele kui v... \n", "\n", " clean_text \n", "0 eesti taasiseseisvumise sümbol eesti vabaduse ... \n", "1 retsensioon eesnimi perekonnanimi magistritööl... \n", "2 annotatsioon esilehe esiküljelevene vanaususul... \n", "3 tallinna pedagoogika ülikool filoloogia teadus... \n", "4 tallinna pedagoogikaülikool eesti keele võõrke... " ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"clean_text\"]=df[\"text\"].apply(lambda x: text_prepare(x, STOPWORDS))\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 31, "id": "ad308850-2c7c-43c9-ab1a-0fbb3c1ad789", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "doc_id doc_104580264061_item.txt\n", "text Eesti taasiseseisvumise sümbol, Eesti vabaduse...\n", "clean_text eesti taasiseseisvumise sümbol eesti vabaduse ...\n", "Name: 0, dtype: object" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "e7a352b4-55c3-4e81-8465-174dd3e4903a", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 5 }