Ülesanded * Kuva, mitu teksti on B2-keeletasemega SELECT COUNT(*) FROM dokmeta WHERE keeletase='B2'; 228 * Kuva iga keeletaseme kohta, mitu teksti on SELECT keeletase, COUNT(*) FROM dokmeta GROUP BY keeletase; * Kuva iga keeletaseme kohta, mitu protsenti tekstidest on sellel tasemel SELECT keeletase, ROUND(100*COUNT(*)/(SELECT COUNT(*) FROM dokmeta), 1) AS protsent FROM dokmeta GROUP BY keeletase; * Kuva iga keeletaseme kohta, mitu protsenti määratud keeletasemega tekstidest on sellel tasemel SELECT keeletase, ROUND(100*COUNT(*)/ (SELECT COUNT(*) FROM dokmeta WHERE keeletase IS NOT NULL), 1) AS protsent FROM dokmeta GROUP BY keeletase; CREATE TABLE dokarvud( kood VARCHAR(50) NOT NULL PRIMARY KEY, tahti INT, sonu INT, lauseid INT, vigu INT, veatyype INT, kolmetahelistepr DOUBLE, viietahelistepr DOUBLE, kymnejarohkemtahelistepr DOUBLE, kahesonalistepr DOUBLE, kolmesonalistepr DOUBLE, kuuekuni9sonalistepr DOUBLE, kymnekuni20sonalistepr DOUBLE, FOREIGN KEY(kood) REFERENCES dokmeta(kood) ); #Looge sarnane CREATE TABLE lause ka doksonaliigid faili näitel CREATE TABLE doksonaliigid( kood VARCHAR(50) NOT NULL PRIMARY KEY, A INT, C INT, D INT, G INT, H INT, I INT, J INT, K INT, N INT, P INT, S INT, U INT, V INT, X INT, Y INT, Z INT, kokku INT, FOREIGN KEY(kood) REFERENCES dokmeta(kood) ); CREATE TABLE sonaddokumendis( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, word VARCHAR(255), documentid VARCHAR(50), FOREIGN KEY(documentid) REFERENCES dokmeta(kood) ); ALTER TABLE sonaddokumendis DROP id; LOAD DATA LOCAL INFILE '/home/jaagup/public_html/2017/ab/sonaddokumendis.txt' INTO TABLE sonaddokumendis CHARACTER SET UTF8 FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES; home/jaagup/public_html/2017/ab Päring: milliseid sõnu on kasutatud A-keeletaseme tekstide juures SELECT * FROM dokmeta WHERE keeletase='A'; SELECT word FROM dokmeta INNER JOIN sonaddokumendis ON sonaddokumendis.documentid=dokmeta.kood WHERE keeletase='A'; use if17_keelekorpus; * Mitu erinevat sõna on kasutatud A-keeletaseme tekstides SELECT COUNT(DISTINCT word) FROM dokmeta INNER JOIN sonaddokumendis ON sonaddokumendis.documentid=dokmeta.kood WHERE keeletase='A'; * Järjesta A-keeletaseme tekstides kasutatud sõnad sageduse alusel SELECT word, COUNT(*) AS kogus FROM dokmeta INNER JOIN sonaddokumendis ON sonaddokumendis.documentid=dokmeta.kood WHERE keeletase='A' GROUP BY word ORDER BY kogus DESC LIMIT 20; * Kuva sõnad, mida on kasutatud A-keeletaseme tekstides, kuid mitte B-keeletaseme tekstides SELECT word FROM dokmeta INNER JOIN sonaddokumendis ON sonaddokumendis.documentid=dokmeta.kood WHERE keeletase='A' AND word NOT IN (SELECT word FROM dokmeta INNER JOIN sonaddokumendis ON sonaddokumendis.documentid=dokmeta.kood WHERE keeletase='B'); SELECT word, COUNT(*) AS kogus FROM (SELECT word FROM dokmeta INNER JOIN sonaddokumendis ON sonaddokumendis.documentid=dokmeta.kood WHERE keeletase='A' AND word NOT IN (SELECT word FROM dokmeta INNER JOIN sonaddokumendis ON sonaddokumendis.documentid=dokmeta.kood WHERE keeletase='B')) AS abitabel GROUP BY word ORDER BY kogus DESC LIMIT 100; SELECT word, COUNT(*) AS kogus FROM (SELECT word FROM dokmeta INNER JOIN sonaddokumendis ON sonaddokumendis.documentid=dokmeta.kood WHERE keeletase='A' AND word NOT IN (SELECT word FROM dokmeta INNER JOIN sonaddokumendis ON sonaddokumendis.documentid=dokmeta.kood WHERE keeletase IN ('B', 'B1', 'B2', 'C', 'C1', 'C2'))) AS abitabel GROUP BY word ORDER BY kogus DESC LIMIT 100; ALTER TABLE sonaddokumendis ADD id INT NOT NULL AUTO_INCREMENT PRIMARY KEY; CREATE TABLE ngram3( tekstikood VARCHAR(50) NOT NULL, ngram3 CHAR(3) NOT NULL, alguskoht INT, PRIMARY KEY (tekstikood, alguskoht), FOREIGN KEY (tekstikood) REFERENCES dokmeta(kood) ); LOAD DATA LOCAL INFILE '/home/jaagup/public_html/2017/ab/ngram3.txt' INTO TABLE ngram3 CHARACTER SET UTF8 FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES; * Kuvage 15 populaarsemat ngrami tabelist ngram3 SELECT ngram3, COUNT(*) AS kogus FROM ngram3 GROUP BY ngram3 ORDER BY kogus DESC LIMIT 15; CREATE INDEX ngram3_indeks1 ON ngram3(ngram3); Kuvage tabeli 5 esimest ngram-i koos vastava teksti keeletasemega (JOIN) * Leidke 15 populaarsemat ngrami tekstidel keeletasemega A, A1 või A2 ; SELECT ngram3, COUNT(*) AS kogus FROM dokmeta INNER JOIN ngram3 ON ngram3.tekstikood=dokmeta.kood WHERE keeletase LIKE 'A%' GROUP BY ngram3 HAVING kogus>=100 ORDER BY kogus DESC; * Leidke 15 populaarsemat ngrami tekstidel keeletasemega B, B1 või B2 ; SELECT ngram3, COUNT(*) AS kogus FROM dokmeta INNER JOIN ngram3 ON ngram3.tekstikood=dokmeta.kood WHERE keeletase LIKE 'B%' GROUP BY ngram3 ORDER BY kogus DESC LIMIT 15; +--------+-------+ | ngram3 | kogus | +--------+-------+ | SZS | 13164 | | ZPV | 11257 | | SZP | 11156 | | SSZ | 9628 | | ASZ | 9165 | | VSZ | 8488 | | SZD | 6481 | | ZSZ | 6145 | | VAS | 5998 | | VPS | 5939 | | SVS | 5828 | | ZSV | 5591 | | SJS | 5578 | | PSV | 5578 | | PVS | 5020 | +--------+-------+ Leidke, millised ngramid esinevad mõlemi keeletaseme ülaotsas SELECT ngram3 FROM dokmeta INNER JOIN ngram3 ON ngram3.tekstikood=dokmeta.kood WHERE keeletase LIKE 'B%' AND ngram3 IN ( SELECT ngram3 FROM dokmeta INNER JOIN ngram3 ON ngram3.tekstikood=dokmeta.kood WHERE keeletase LIKE 'A%' GROUP BY ngram3 HAVING COUNT(*)>1000 ORDER BY COUNT(*) DESC ) GROUP BY ngram3 HAVING COUNT(*)>5000 ORDER BY COUNT(*) DESC; +--------+ | ngram3 | +--------+ | SZS | | ZPV | | SZP | | SSZ | | ASZ | | VSZ | | SZD | | ZSZ | | VAS | | VPS | | SVS | | ZSV | | SJS | | PSV | | PVS | +--------+ Millised on ainult A-tasemel SELECT ngram3 FROM dokmeta INNER JOIN ngram3 ON ngram3.tekstikood=dokmeta.kood WHERE keeletase LIKE 'A%' AND ngram3 NOT IN ( SELECT ngram3 FROM dokmeta INNER JOIN ngram3 ON ngram3.tekstikood=dokmeta.kood WHERE keeletase LIKE 'B%' GROUP BY ngram3 HAVING COUNT(*)>500 ORDER BY COUNT(*) DESC ) GROUP BY ngram3 HAVING COUNT(*)>500 ORDER BY COUNT(*) DESC; Millised on ainult B-tasemel SELECT ngram3 FROM dokmeta INNER JOIN ngram3 ON ngram3.tekstikood=dokmeta.kood WHERE keeletase LIKE 'B%' AND ngram3 NOT IN ( SELECT ngram3 FROM dokmeta INNER JOIN ngram3 ON ngram3.tekstikood=dokmeta.kood WHERE keeletase LIKE 'A%' GROUP BY ngram3 HAVING COUNT(*)>500 ORDER BY COUNT(*) DESC ) GROUP BY ngram3 HAVING COUNT(*)>1000 ORDER BY COUNT(*) DESC; CREATE TABLE ngram4( tekstikood VARCHAR(50) NOT NULL, ngram5 CHAR(4) NOT NULL, alguskoht INT, PRIMARY KEY (tekstikood, alguskoht), FOREIGN KEY (tekstikood) REFERENCES dokmeta(kood) ); CREATE TABLE ngram1( tekstikood VARCHAR(50) NOT NULL, sona VARCHAR(255) NOT NULL, ngram1 CHAR(3) NOT NULL, alguskoht INT, suurtahega VARCHAR(50), lemma VARCHAR(50), PRIMARY KEY (tekstikood, alguskoht), FOREIGN KEY (tekstikood) REFERENCES dokmeta(kood) ); LOAD DATA LOCAL INFILE '/home/jaagup/public_html/2017/ab/ngram1.txt' INTO TABLE ngram1 CHARACTER SET UTF8 FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES;