[jaagup@greeny korpus]$ mysql -uif17 -pif17 if17_keelekorpus Reading table information for completion of table and column names You can turn off this feature to get a quicker startup with -A Welcome to the MariaDB monitor. Commands end with ; or \g. Your MariaDB connection id is 9892 Server version: 10.2.8-MariaDB MariaDB Server Copyright (c) 2000, 2017, Oracle, MariaDB Corporation Ab and others. Type 'help;' or '\h' for help. Type '\c' to clear the current input statement. MariaDB [if17_keelekorpus]> SELECT * FROM korpusenimed; +-------------+--------------------------------+ | korpusekood | korpusenimi | +-------------+--------------------------------+ | cFOoRQekA | EVKK | | cFqPphvYi | Eesti keele olümpiaadi tööd | | cgSRJPKTr | Vene keel kui emakeel | | clWmOIrLa | REKKi kogud | | cwUprXCTL | Eesti teaduskeel | | cwUSEqQLt | Akadeemiline õppijakeel | | cYDRkpymb | K1 referentskorpus | | cZjHWUPtD | Vene keel kui võõrkeel | +-------------+--------------------------------+ 8 rows in set (0.00 sec) MariaDB [if17_keelekorpus]> SELECT * FROM dokmeta LIMIT 5; +-----------------------+-----------+------------+------------+---------+-------+--------+-------+---------+----------+-----------+---------+-------------+ | kood | korpus | tekstikeel | tekstityyp | elukoht | taust | vanus | sugu | emakeel | kodukeel | keeletase | haridus | abivahendid | +-----------------------+-----------+------------+------------+---------+-------+--------+-------+---------+----------+-----------+---------+-------------+ | doc_100636852915_item | cFOoRQekA | eesti | essee | idaviru | op | kuni18 | naine | vene | vene | B | pohi | ei | | doc_100636852916_item | cFOoRQekA | eesti | muu | idaviru | op | kuni18 | naine | vene | vene | B | pohi | ei | | doc_100636852917_item | cFOoRQekA | eesti | essee | idaviru | op | kuni18 | naine | vene | vene | B | pohi | ei | | doc_1010138197_item | cFOoRQekA | eesti | muu | tallinn | ylop | kuni26 | naine | vene | vene | A | kesk | ei | | doc_1010138198_item | cFOoRQekA | eesti | muu | tallinn | ylop | kuni26 | naine | vene | vene | B | kesk | ei | +-----------------------+-----------+------------+------------+---------+-------+--------+-------+---------+----------+-----------+---------+-------------+ 5 rows in set (0.00 sec) MariaDB [if17_keelekorpus]> MariaDB [if17_keelekorpus]> SELECT COALESCE(keeletase, 'määramata') AS keeletase, COUNT(*) AS kogus FROM dokmeta GROUP BY dokmeta.keeletase; +-------------+-------+ | keeletase | kogus | +-------------+-------+ | määramata | 8660 | | A | 1378 | | A1 | 1 | | A2 | 217 | | B | 1154 | | B1 | 412 | | B2 | 228 | | C | 359 | | C1 | 130 | | C2 | 185 | +-------------+-------+ 10 rows in set (0.01 sec) SELECT COALESCE(keeletase, 'määramata') AS keeletase, COUNT(*) AS kogus, FROM dokmeta GROUP BY dokmeta.keeletase; --Lisa veebilehele ka protsenditulp SELECT COALESCE(keeletase, 'määramata') AS keeletase, COUNT(*) AS kogus, COUNT(*) / (SELECT COUNT(*) FROM dokmeta) AS suhe FROM dokmeta GROUP BY dokmeta.keeletase; +-------------+-------+--------+ | keeletase | kogus | suhe | +-------------+-------+--------+ | määramata | 8660 | 0.6806 | | A | 1378 | 0.1083 | | A1 | 1 | 0.0001 | | A2 | 217 | 0.0171 | | B | 1154 | 0.0907 | | B1 | 412 | 0.0324 | | B2 | 228 | 0.0179 | | C | 359 | 0.0282 | | C1 | 130 | 0.0102 | | C2 | 185 | 0.0145 | +-------------+-------+--------+ SELECT COALESCE(keeletase, 'määramata') AS keeletase, COUNT(*) AS kogus, ROUND(100* COUNT(*) / (SELECT COUNT(*) FROM dokmeta), 1) AS protsent FROM dokmeta GROUP BY dokmeta.keeletase; +-------------+-------+----------+ | keeletase | kogus | protsent | +-------------+-------+----------+ | määramata | 8660 | 68.1 | | A | 1378 | 10.8 | | A1 | 1 | 0.0 | | A2 | 217 | 1.7 | | B | 1154 | 9.1 | | B1 | 412 | 3.2 | | B2 | 228 | 1.8 | | C | 359 | 2.8 | | C1 | 130 | 1.0 | | C2 | 185 | 1.5 | +-------------+-------+----------+ MariaDB [if17_keelekorpus]> SHOW TABLES; +----------------------------+ | Tables_in_if17_keelekorpus | +----------------------------+ | dokarvud | | dokmeta | | doksonaliigid | | elukohad | | haridustasemed | | keeled | | keeletasemed | | korpusenimed | | ngram1 | | ngram2 | | ngram3 | | ngram4 | | ngram5 | | sonaddokumendis | | sonaliikide_lyhendid | | taustad | | tekstityybid | | vanusetasemed | +----------------------------+ 18 rows in set (0.00 sec) MariaDB [if17_keelekorpus]> SELECT * FROM sonaliikide_lyhendid; +-------------+---------------------------+ | liigilyhend | liigikirjeldus | +-------------+---------------------------+ | A | omadussõna algvõrre | | C | omadussõna keskvõrre | | D | määrsõna | | G | käändumatu omadussõna | | H | pärisnimi | | I | hüüdsõna | | J | sidesõna | | K | kaassõna | | N | põhiarvsõna | | P | asesõna | | S | nimisõna | | U | omadussõna ülivõrre | | V | tegusõna | | X | verbi juurde kuuluv sõna | | Y | lühend | | Z | lausemärk | +-------------+---------------------------+ MariaDB [if17_keelekorpus]> SELECT COUNT(*) FROM ngram2; +----------+ | COUNT(*) | +----------+ | 3503295 | +----------+ MariaDB [if17_keelekorpus]> SELECT COUNT(DISTINCT ngram2) FROM ngram2; +------------------------+ | COUNT(DISTINCT ngram2) | +------------------------+ | 274 | +------------------------+ EXPLAIN ngram2; +------------+-------------+------+-----+---------+-------+ | Field | Type | Null | Key | Default | Extra | +------------+-------------+------+-----+---------+-------+ | tekstikood | varchar(50) | NO | PRI | NULL | | | ngram2 | char(3) | NO | | NULL | | | alguskoht | int(11) | NO | PRI | NULL | | +------------+-------------+------+-----+---------+-------+ Kuvage iga paari kohta, mitu korda see esineb SELECT ngram2, COUNT(*) FROM ngram2 GROUP BY ngram2; +--------+----------+ | ngram2 | COUNT(*) | +--------+----------+ | AA | 5359 | | AC | 216 | | AD | 4735 | | AG | 651 | | AH | 3393 | | AI | 83 | | AJ | 8873 | | AK | 345 | | AN | 860 | | AO | 470 | | AP | 4277 | Kuvage paarid esinemiste arvu järjekorras SELECT ngram2, COUNT(*) FROM ngram2 GROUP BY ngram2 ORDER BY COUNT(*); | PS | 97811 | | AS | 103158 | | ZP | 106546 | | VS | 117187 | | ZS | 133291 | | SV | 134015 | | PV | 139244 | | SS | 244501 | | SZ | 334535 | +--------+----------+ Näidake iga paari suhteline sagedus (esinemiste kord jagatuna üldarvuga) SELECT ngram2, COUNT(*)/(SELECT COUNT(*) FROM ngram2) AS suhe FROM ngram2 GROUP BY ngram2 ORDER BY COUNT(*); VS | 0.0335 | | ZS | 0.0380 | | SV | 0.0383 | | PV | 0.0397 | | SS | 0.0698 | | SZ | 0.0955 | +--------+--------+ -- Leidke samad tulemused soome emakeelega autorite tekstide kohta MariaDB [if17_keelekorpus]> explain dokmeta; +-------------+-------------------------+------+-----+---------+-------+ | Field | Type | Null | Key | Default | Extra | +-------------+-------------------------+------+-----+---------+-------+ | kood | varchar(50) | NO | PRI | NULL | | | korpus | varchar(20) | NO | MUL | NULL | | | tekstikeel | varchar(20) | YES | MUL | NULL | | | tekstityyp | varchar(20) | YES | MUL | NULL | | | elukoht | varchar(20) | YES | MUL | NULL | | | taust | varchar(20) | YES | MUL | NULL | | | vanus | varchar(20) | YES | MUL | NULL | | | sugu | enum('naine','mees','') | YES | | NULL | | | emakeel | varchar(20) | YES | MUL | NULL | | | kodukeel | varchar(20) | YES | MUL | NULL | | | keeletase | varchar(20) | YES | MUL | NULL | | | haridus | varchar(20) | YES | MUL | NULL | | | abivahendid | enum('jah','ei','') | YES | | NULL | | +-------------+-------------------------+------+-----+---------+-------+ MariaDB [if17_keelekorpus]> SELECT * FROM dokmeta LIMIT 5; +-----------------------+-----------+------------+------------+---------+-------+--------+-------+---------+----------+-----------+---------+-------------+ | kood | korpus | tekstikeel | tekstityyp | elukoht | taust | vanus | sugu | emakeel | kodukeel | keeletase | haridus | abivahendid | +-----------------------+-----------+------------+------------+---------+-------+--------+-------+---------+----------+-----------+---------+-------------+ | doc_100636852915_item | cFOoRQekA | eesti | essee | idaviru | op | kuni18 | naine | vene | vene | B | pohi | ei | | doc_100636852916_item | cFOoRQekA | eesti | muu | idaviru | op | kuni18 | naine | vene | vene | B | pohi | ei | | doc_100636852917_item | cFOoRQekA | eesti | essee | idaviru | op | kuni18 | naine | vene | vene | B | pohi | ei | | doc_1010138197_item | cFOoRQekA | eesti | muu | tallinn | ylop | kuni26 | naine | vene | vene | A | kesk | ei | | doc_1010138198_item | cFOoRQekA | eesti | muu | tallinn | ylop | kuni26 | naine | vene | vene | B | kesk | ei | +-----------------------+-----------+------------+------------+---------+-------+--------+-------+---------+----------+-----------+---------+-------------+ 5 rows in set (0.00 sec) MariaDB [if17_keelekorpus]> SELECT * FROM ngram2 LIMIT 5 -> ; +-----------------------+--------+-----------+ | tekstikood | ngram2 | alguskoht | +-----------------------+--------+-----------+ | doc_100636852915_item | SP | 1 | | doc_100636852915_item | PV | 2 | | doc_100636852915_item | VD | 3 | | doc_100636852915_item | DS | 4 | | doc_100636852915_item | SJ | 5 | +-----------------------+--------+-----------+ 5 rows in set (0.00 sec) SELECT kood FROM dokmeta WHERE emakeel='soome'; MariaDB [if17_keelekorpus]> SELECT kood FROM dokmeta WHERE emakeel='soome'; +-----------------------+ | kood | +-----------------------+ | doc_104580264060_item | | doc_104580264061_item | | doc_104580264062_item | | doc_104580264063_item | SELECT ngram2 FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE dokmeta.emakeel='soome'; | ZV | | VD | | DZ | +--------+ 120630 rows in set (0.28 sec) -- lugege iga erineva paari kohta kokku, mitu neid on SELECT ngram2, COUNT(*) FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE dokmeta.emakeel='soome' GROUP BY ngram2 ORDER BY COUNT(*); | AS | 4074 | | VS | 4693 | | PV | 5220 | | SS | 5424 | | SV | 6043 | | SZ | 9100 | +--------+----------+ 235 rows in set (0.16 sec) -- leidke suhtelised sagedused SELECT COUNT(*) FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE emakeel='soome'; +----------+ | COUNT(*) | +----------+ | 120630 | +----------+ SELECT ngram2, COUNT(*)/ (SELECT COUNT(*) FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE emakeel='soome') AS soomesuhe FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE dokmeta.emakeel='soome' GROUP BY ngram2 ORDER BY COUNT(*); | PS | 0.0300 | | VD | 0.0326 | | ZP | 0.0333 | | AS | 0.0338 | | VS | 0.0389 | | PV | 0.0433 | | SS | 0.0450 | | SV | 0.0501 | | SZ | 0.0754 | +--------+-----------+ 235 rows in set (0.32 sec) -- leidke samad andmed vene emakeelega autorite tekstide kohta SELECT ngram2, COUNT(*)/ (SELECT COUNT(*) FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE emakeel='vene') AS venesuhe FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE dokmeta.emakeel='vene' GROUP BY ngram2 ORDER BY COUNT(*); | AS | 0.0266 | | VS | 0.0295 | | PV | 0.0300 | | YS | 0.0318 | | SV | 0.0378 | | ZS | 0.0484 | | SS | 0.1048 | | SZ | 0.1065 | +--------+----------+ 266 rows in set (1.24 sec) keel, paar, suhe SELECT 'vene' AS emakeel, ngram2, COUNT(*)/ (SELECT COUNT(*) FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE emakeel='vene') AS suhe FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE dokmeta.emakeel='vene' GROUP BY ngram2 UNION ALL SELECT 'soome', ngram2, COUNT(*)/ (SELECT COUNT(*) FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE emakeel='soome') AS soomesuhe FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE dokmeta.emakeel='soome' GROUP BY ngram2; | soome | ZU | 0.0001 | | soome | ZV | 0.0148 | | soome | ZY | 0.0014 | | soome | ZZ | 0.0025 | +---------+--------+--------+ 501 rows in set (1.44 sec) SELECT emakeel, ngram2, suhe FROM (SELECT 'vene' AS emakeel, ngram2, COUNT(*)/ (SELECT COUNT(*) FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE emakeel='vene') AS suhe FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE dokmeta.emakeel='vene' GROUP BY ngram2 UNION ALL SELECT 'soome', ngram2, COUNT(*)/ (SELECT COUNT(*) FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE emakeel='soome') AS suhe FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE dokmeta.emakeel='soome' GROUP BY ngram2) AS vahetabel WHERE vahetabel.suhe>0.008; SELECT ngram2, MAX(suhe)/MIN(suhe) AS keeltesuhe FROM (SELECT 'vene' AS emakeel, ngram2, COUNT(*)/ (SELECT COUNT(*) FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE emakeel='vene') AS suhe FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE dokmeta.emakeel='vene' GROUP BY ngram2 UNION ALL SELECT 'soome', ngram2, COUNT(*)/ (SELECT COUNT(*) FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE emakeel='soome') AS suhe FROM dokmeta INNER JOIN ngram2 ON dokmeta.kood=ngram2.tekstikood WHERE dokmeta.emakeel='soome' GROUP BY ngram2) AS vahetabel WHERE vahetabel.suhe>0.008 GROUP BY ngram2 ORDER BY keeltesuhe; +--------+------------+ | ngram2 | keeltesuhe | +--------+------------+ | HV | 1.00000000 | | JV | 1.00000000 | | SA | 1.00000000 | | SD | 1.00000000 | | SK | 1.00000000 | | VV | 1.00000000 | | VH | 1.00000000 | | ZY | 1.00000000 | | SY | 1.00000000 | | YS | 1.00000000 | | DD | 1.00000000 | | VP | 1.00625000 | | HZ | 1.01960784 | | VZ | 1.03271028 | | HS | 1.05000000 | | ZV | 1.06081081 | | ZJ | 1.13043478 | | DZ | 1.18604651 | | ZD | 1.21468927 | | DV | 1.24285714 | | AS | 1.27067669 | | PS | 1.31004367 | | ZP | 1.31102362 | | VS | 1.31864407 | | JS | 1.32413793 | | SV | 1.32539683 | | AZ | 1.33000000 | | ZH | 1.33333333 | | SZ | 1.41246684 | | PV | 1.44333333 | | VA | 1.48148148 | | JP | 1.58426966 | | DS | 1.63114754 | | DA | 1.65476190 | | SJ | 1.66433566 | | DP | 1.67073171 | | VD | 1.72486772 | | ZS | 2.08620690 | | SS | 2.32888889 | +--------+------------+ 39 rows in set (1.91 sec) -- Paigutage samade paaride soome ja vene suhtarvud samasse tabeli ritta