"""Collect mentions of one named-entity category from annotated token files.

The script loads a list of known names, scans every ``.txt`` annotation file
in the item directory, and separates the names it finds into those that match
the known list and those that do not. For both groups it writes a JSON file
(name -> comma-separated file names) and a frequency table sorted by the
number of files mentioning each name.
"""
import json
import os

kaust = "/home/jaagup/public_html/2025/digihoidla/kood3/"

# Known-name list to compare against (keep exactly one line uncommented).
#name_file_path = "isikud_rara_ladina_2.txt"
#name_file_path = "knab_koos_2.txt"
#name_file_path = "org_known_names.txt"
name_file_path = "gpebaas.txt"

# Entity category to extract; a row counts as a name of this category when
# one of its tag columns ends with this string (keep one line uncommented).
#category = "per"
#category = "org"
category = "gpe"
#category = "title"
#category = "prod"

# Set of the individual words that make up multi-word known names.
partial_name_set = set()
# Set of complete known names (single-word and multi-word).
full_name_set = set()

# Column indices of a semicolon-separated annotation row.
_FORM = 3    # word form as it appears in the text
_LEMMA = 4   # lemma (base form)
_POS = 5     # part-of-speech tag; "propn" marks a proper noun
# Two tag columns whose values are "o" or start with "b"/"i" and end with a
# category suffix. NOTE(review): presumably BIO tags from two annotation
# layers - confirm against the file format description.
_TAG_A = 11
_TAG_B = 12


def load_known_names(path):
    """Fill ``full_name_set`` and ``partial_name_set`` from the list at *path*.

    Every non-blank line is one name. Names are lower-cased and their parts
    are joined with single spaces, because the candidates built in
    ``get_names`` are always joined with exactly one space. The parts of
    multi-word names are additionally stored in ``partial_name_set``.
    """
    with open(path, encoding="utf-8") as name_file:
        for raw_name in name_file:
            name_parts = raw_name.lower().split()
            if not name_parts:
                continue
            full_name_set.add(" ".join(name_parts))
            if len(name_parts) > 1:
                partial_name_set.update(name_parts)


def _looks_like_name(row):
    """Return True if *row* is a proper noun or carries a non-"o" tag."""
    return row[_POS] == "propn" or row[_TAG_A] != "o" or row[_TAG_B] != "o"


def _first_known(candidates):
    """Return the first candidate present in ``full_name_set``, else None."""
    for candidate in candidates:
        if candidate in full_name_set:
            return candidate
    return None


def _continues(row, previous_row, column):
    """Return True if *row*'s tag is an "i" tag sharing *previous_row*'s suffix."""
    return (row[column][-3:] == previous_row[column][-3:]
            and row[column].startswith("i"))


def _add_new_name(new_name, names_found, new_names, reject_repeated_start=False):
    """Store *new_name* as unknown unless it is already among the found names.

    With *reject_repeated_start* the name is also dropped when its first two
    words are identical.
    """
    if new_name in names_found:
        return
    if reject_repeated_start:
        words = new_name.split()
        if words[0] == words[1]:
            return
    new_names.add(new_name)


def _record_listed_part(current, previous_lines, names_found, new_names):
    """Handle a name row that follows a name and is a known name part."""
    previous = previous_lines[-1]

    # Forms of a two-part name: built from lemmas, word forms or a mix.
    known = _first_known((
        previous[_LEMMA] + " " + current[_LEMMA],
        previous[_FORM] + " " + current[_LEMMA],
        previous[_FORM] + " " + current[_FORM],
        previous[_LEMMA] + " " + current[_FORM],
    ))
    if known is not None:
        names_found.add(known)
        return

    # No pair is known: if the row before the previous one is a name too,
    # look for a three-part name.
    if len(previous_lines) > 1 and _looks_like_name(previous_lines[-2]):
        before_previous = previous_lines[-2]
        # The most likely forms of a three-part name.
        known = _first_known((
            before_previous[_LEMMA] + " " + previous[_LEMMA] + " " + current[_LEMMA],
            before_previous[_FORM] + " " + previous[_FORM] + " " + current[_LEMMA],
            before_previous[_FORM] + " " + previous[_LEMMA] + " " + current[_LEMMA],
            before_previous[_FORM] + " " + previous[_FORM] + " " + current[_FORM],
        ))
        if known is not None:
            names_found.add(known)
            return

        # No triplet is known: record an unknown name only if the current
        # row continues the previous one.
        if not (_continues(current, previous, _TAG_A)
                or _continues(current, previous, _TAG_B)):
            return

        all_three_belong_together = (
            (before_previous[_TAG_A][-3:] == previous[_TAG_A][-3:]
             and before_previous[_TAG_A].startswith("b")
             and previous[_TAG_A].startswith("i"))
            or (before_previous[_TAG_B][-3:] == previous[_TAG_B][-3:]
                and before_previous[_TAG_B].startswith("b")
                and previous[_TAG_B].startswith("i"))
        )
        if all_three_belong_together:
            # NOTE(review): the middle part is the lemma of the SAME row as
            # the first part; the sibling branch in _record_unlisted_part
            # uses the previous row's lemma instead. Kept as in the original
            # - confirm which one is intended.
            new_name = (before_previous[_FORM] + " " + before_previous[_LEMMA]
                        + " " + current[_LEMMA])
            _add_new_name(new_name, names_found, new_names,
                          reject_repeated_start=True)
        elif previous[_TAG_A].startswith("b") or previous[_TAG_B].startswith("b"):
            # Only the previous and the current row belong together:
            # store word form + lemma (no check against names_found here).
            new_names.add(previous[_FORM] + " " + current[_LEMMA])
        return

    # The row before the previous one is not a name: store word form + lemma
    # if the current row continues a name begun on the previous row.
    if ((_continues(current, previous, _TAG_A)
         and previous[_TAG_A].startswith("b"))
            or (_continues(current, previous, _TAG_B)
                and previous[_TAG_B].startswith("b"))):
        _add_new_name(previous[_FORM] + " " + current[_LEMMA],
                      names_found, new_names)


def _record_unlisted_part(current, previous_lines, names_found, new_names):
    """Handle a name row that follows a name but is not a known name part."""
    previous = previous_lines[-1]
    name_cat_1 = current[_TAG_A][-3:] if _continues(current, previous, _TAG_A) else ""
    name_cat_2 = current[_TAG_B][-3:] if _continues(current, previous, _TAG_B) else ""

    # NOTE(review): when name_cat_1/name_cat_2 is empty, endswith("") is
    # always true, so any "b"-tagged row two positions back counts as the
    # start of a three-part name. Kept as in the original.
    if len(previous_lines) > 1 and (
            (previous_lines[-2][_TAG_A].endswith(name_cat_1)
             and previous_lines[-2][_TAG_A].startswith("b"))
            or (previous_lines[-2][_TAG_B].endswith(name_cat_2)
                and previous_lines[-2][_TAG_B].startswith("b"))):
        # Three-part name: word form + lemma + lemma.
        new_name = (previous_lines[-2][_FORM] + " " + previous[_LEMMA]
                    + " " + current[_LEMMA])
        _add_new_name(new_name, names_found, new_names,
                      reject_repeated_start=True)
    else:
        # Two-part name: word form + lemma.
        _add_new_name(previous[_FORM] + " " + current[_LEMMA],
                      names_found, new_names)


def get_names(directory, filename):
    """Find names of the configured category in one annotation file.

    The file is read as semicolon-separated rows; the first row is skipped as
    a header. Rows are lower-cased, and underscores and equals signs (used in
    the base forms of compounds and derivations) are removed from every
    field. Rows with fewer than 13 fields are ignored.

    Args:
        directory: Directory containing the file.
        filename: Name of the file inside *directory*.

    Returns:
        A tuple ``(names_found, new_names)`` of two sets: names that match
        ``full_name_set`` and names that do not.
    """
    # The two preceding rows, kept for detecting multi-word names.
    previous_lines = []
    # A single-word name waiting to be confirmed by the next non-name row.
    single_full_name = ""
    names_found = set()
    new_names = set()

    with open(directory + "/" + filename, "r", encoding="utf-8") as file_text:
        next(file_text, None)  # skip the header row
        for line in file_text:
            line_text = [field.replace("_", "").replace("=", "")
                         for field in line.lower().strip().split(";")]
            # Blank or truncated rows cannot be classified; skip them
            # instead of failing on the tag columns.
            if len(line_text) <= _TAG_B:
                continue

            if (line_text[_TAG_A].endswith(category)
                    or line_text[_TAG_B].endswith(category)):
                if previous_lines and _looks_like_name(previous_lines[-1]):
                    # The previous word is a name: this may be a multi-word name.
                    if (line_text[_FORM] in partial_name_set
                            or line_text[_LEMMA] in partial_name_set):
                        _record_listed_part(line_text, previous_lines,
                                            names_found, new_names)
                    else:
                        _record_unlisted_part(line_text, previous_lines,
                                              names_found, new_names)
                # The previous word is not a name: remember the word in the
                # form that matches the list, but do not store it yet.
                elif line_text[_LEMMA] in full_name_set:
                    single_full_name = line_text[_LEMMA]
                elif line_text[_FORM] in full_name_set:
                    # A lemmatisation error may leave only the word form matching.
                    single_full_name = line_text[_FORM]
                elif (line_text[_FORM] not in partial_name_set
                      and line_text[_LEMMA] not in partial_name_set):
                    # Neither a full name nor a name part: keep the lemma.
                    single_full_name = line_text[_LEMMA]
            elif (single_full_name and previous_lines
                  and (single_full_name == previous_lines[-1][_FORM]
                       or single_full_name == previous_lines[-1][_LEMMA])):
                # The word is not tagged as a name; check whether it still
                # belongs to the preceding single-word name, otherwise store
                # that name on its own.
                potential_name = single_full_name + " " + line_text[_FORM]
                if potential_name in full_name_set:
                    names_found.add(potential_name)
                elif single_full_name in full_name_set:
                    names_found.add(single_full_name)
                else:
                    new_names.add(single_full_name)
                single_full_name = ""

            # Keep the row for the next iterations; only two rows are remembered.
            previous_lines.append(line_text)
            if len(previous_lines) > 2:
                previous_lines.pop(0)

    return names_found, new_names


def _append_mentions(mentions, names, filename):
    """Append *filename* to the comma-separated file list of every name."""
    for name in names:
        if name in mentions:
            mentions[name] = mentions[name] + "," + filename
        else:
            mentions[name] = filename


def write_frequency_table(json_path, output_path):
    """Write a ``name;category;frequency;entries`` table from a names JSON file.

    *json_path* holds a mapping of name -> comma-separated file names. Rows
    are ordered by descending number of files; names with equal counts keep
    the order they have in the JSON file.
    """
    with open(json_path, encoding="utf-8") as json_file:
        data = json.load(json_file)

    ordered_names = sorted(data, key=lambda name: -len(data[name].split(",")))
    with open(output_path, "w", encoding="utf-8") as freq_list:
        freq_list.write("name;category;frequency;entries\n")
        for name in ordered_names:
            mentions = data[name].split(",")
            freq_list.write(name + ";" + category + ";"
                            + str(len(mentions)) + ";" + data[name] + "\n")


def main():
    """Scan all annotation files and write the JSON and frequency outputs."""
    load_known_names(kaust + name_file_path)

    # Dictionaries linking each name to the files that mention it.
    known_name_dict = {}
    new_name_dict = {}

    item_directory = kaust + "../detailid"
    nr = 0
    for entry in os.scandir(item_directory):
        if not entry.name.endswith(".txt"):
            continue
        nr += 1
        if nr % 1000 == 0:
            # Progress output.
            print(nr, entry.name)
            print(new_name_dict)
        names_found, new_names = get_names(item_directory, entry.name)
        _append_mentions(known_name_dict, names_found, entry.name)
        _append_mentions(new_name_dict, new_names, entry.name)

    known_names_file_path = kaust + category + "_known_names.json"
    with open(known_names_file_path, "w", encoding="utf-8") as f_known:
        json.dump(known_name_dict, f_known)

    new_names_file_path = kaust + category + "_new_names.json"
    with open(new_names_file_path, "w", encoding="utf-8") as f_new:
        json.dump(new_name_dict, f_new)

    # Count the related files per name and save the data as tables.
    write_frequency_table(known_names_file_path,
                          known_names_file_path[:-5] + "_freq.txt")
    write_frequency_table(new_names_file_path,
                          new_names_file_path[:-5] + "_freq.txt")


if __name__ == "__main__":
    main()