arvukogumite võrdlemine

# Two separate samples of heights (boys and girls, going by the names).
poistepikkused <- c(172, 174, 173)
tydrukutepikkused <- c(167, 169, 171, 170)

# Two-sample comparison of the means; with default arguments t.test()
# runs the Welch variant.
t.test(x = poistepikkused, y = tydrukutepikkused)

## 
##  Welch Two Sample t-test
## 
## data:  poistepikkused and tydrukutepikkused
## t = 3.638, df = 4.8496, p-value = 0.01575
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.075396 6.424604
## sample estimates:
## mean of x mean of y 
##    173.00    169.25

# Two equally long height vectors (named for January and May).
poistepikkused_jaanuar <- c(170, 173, 173)
poistepikkused_mai <- c(172, 174, 173)

# Paired test: the element-wise differences (May minus January) are tested
# against a mean of zero.
t.test(
  x = poistepikkused_mai,
  y = poistepikkused_jaanuar,
  paired = TRUE
)

## 
##  Paired t-test
## 
## data:  poistepikkused_mai and poistepikkused_jaanuar
## t = 1.7321, df = 2, p-value = 0.2254
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.484138  3.484138
## sample estimates:
## mean of the differences 
##                       1

# Same two vectors, now treated as independent samples (default Welch test)
# for contrast with the paired result.
t.test(x = poistepikkused_mai, y = poistepikkused_jaanuar)

## 
##  Welch Two Sample t-test
## 
## data:  poistepikkused_mai and poistepikkused_jaanuar
## t = 0.86603, df = 3.2, p-value = 0.4465
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.548197  4.548197
## sample estimates:
## mean of x mean of y 
##       173       172

# One-sided unpaired test: the alternative is that the mean of the first
# vector is greater than the mean of the second.
t.test(
  x = poistepikkused_mai,
  y = poistepikkused_jaanuar,
  alternative = "greater"
)

## 
##  Welch Two Sample t-test
## 
## data:  poistepikkused_mai and poistepikkused_jaanuar
## t = 0.86603, df = 3.2, p-value = 0.2233
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -1.649569       Inf
## sample estimates:
## mean of x mean of y 
##       173       172

# One-sided paired test: the alternative is that the mean element-wise
# difference (first vector minus second) is greater than zero.
t.test(
  x = poistepikkused_mai,
  y = poistepikkused_jaanuar,
  alternative = "greater",
  paired = TRUE
)

## 
##  Paired t-test
## 
## data:  poistepikkused_mai and poistepikkused_jaanuar
## t = 1.7321, df = 2, p-value = 0.1127
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -0.6858545        Inf
## sample estimates:
## mean of the differences 
##                       1

http://www.tlu.ee/~jaagup/andmed/keel/korpus/dokmeta.txt

http://www.tlu.ee/~jaagup/andmed/keel/korpus/doksonaliigid.txt

Illustreerige naiste ja meeste nimisõnade (S) kasutust tekstides.

Püüdke t-testiga näidata nende sarnasust või erinevust.

library(tidyverse)

## -- Attaching packages ---------------------------------------------------------- tidyverse 1.2.1 --

## <U+221A> ggplot2 3.0.0     <U+221A> purrr   0.2.5
## <U+221A> tibble  1.4.2     <U+221A> dplyr   0.7.6
## <U+221A> tidyr   0.8.1     <U+221A> stringr 1.3.1
## <U+221A> readr   1.1.1     <U+221A> forcats 0.3.0

## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Read the document metadata table straight from its URL; read_csv() guesses
# the column types and reports them.
dokmeta <- read_csv(
  file = "http://www.tlu.ee/~jaagup/andmed/keel/korpus/dokmeta.txt"
)

## Parsed with column specification:
## cols(
##   kood = col_character(),
##   korpus = col_character(),
##   tekstikeel = col_character(),
##   tekstityyp = col_character(),
##   elukoht = col_character(),
##   taust = col_character(),
##   vanus = col_character(),
##   sugu = col_character(),
##   emakeel = col_character(),
##   kodukeel = col_character(),
##   keeletase = col_character(),
##   haridus = col_character(),
##   abivahendid = col_character()
## )

# Preview the first rows after dropping every row that has an NA in any column.
dokmeta %>%
  na.omit() %>%
  head()

## # A tibble: 6 x 13
##   kood  korpus tekstikeel tekstityyp elukoht taust vanus sugu  emakeel
##   <chr> <chr>  <chr>      <chr>      <chr>   <chr> <chr> <chr> <chr>  
## 1 doc_~ cFOoR~ eesti      essee      idaviru op    kuni~ naine vene   
## 2 doc_~ cFOoR~ eesti      muu        idaviru op    kuni~ naine vene   
## 3 doc_~ cFOoR~ eesti      essee      idaviru op    kuni~ naine vene   
## 4 doc_~ cFOoR~ eesti      muu        tallinn ylop  kuni~ naine vene   
## 5 doc_~ cFOoR~ eesti      muu        tallinn ylop  kuni~ naine vene   
## 6 doc_~ cFOoR~ eesti      muu        tallinn ylop  kuni~ naine vene   
## # ... with 4 more variables: kodukeel <chr>, keeletase <chr>,
## #   haridus <chr>, abivahendid <chr>

# Preview the first rows after dropping only the rows where `sugu` is missing.
dokmeta %>%
  filter(!is.na(sugu)) %>%
  head()

## # A tibble: 6 x 13
##   kood  korpus tekstikeel tekstityyp elukoht taust vanus sugu  emakeel
##   <chr> <chr>  <chr>      <chr>      <chr>   <chr> <chr> <chr> <chr>  
## 1 doc_~ cFOoR~ eesti      essee      idaviru op    kuni~ naine vene   
## 2 doc_~ cFOoR~ eesti      muu        idaviru op    kuni~ naine vene   
## 3 doc_~ cFOoR~ eesti      essee      idaviru op    kuni~ naine vene   
## 4 doc_~ cFOoR~ eesti      muu        tallinn ylop  kuni~ naine vene   
## 5 doc_~ cFOoR~ eesti      muu        tallinn ylop  kuni~ naine vene   
## 6 doc_~ cFOoR~ eesti      muu        tallinn ylop  kuni~ naine vene   
## # ... with 4 more variables: kodukeel <chr>, keeletase <chr>,
## #   haridus <chr>, abivahendid <chr>

# Read the per-document count table (one integer column per code, plus the
# total `kokku`) from its URL.
doksonaliigid <- read_csv(
  file = "http://www.tlu.ee/~jaagup/andmed/keel/korpus/doksonaliigid.txt"
)

## Parsed with column specification:
## cols(
##   kood = col_character(),
##   A = col_integer(),
##   C = col_integer(),
##   D = col_integer(),
##   G = col_integer(),
##   H = col_integer(),
##   I = col_integer(),
##   J = col_integer(),
##   K = col_integer(),
##   N = col_integer(),
##   P = col_integer(),
##   S = col_integer(),
##   U = col_integer(),
##   V = col_integer(),
##   X = col_integer(),
##   Y = col_integer(),
##   Z = col_integer(),
##   kokku = col_integer()
## )

# Keep only metadata rows with no NA in any column, then attach the count
# columns for the documents whose `kood` appears in both tables.
koos <- inner_join(na.omit(dokmeta), doksonaliigid, by = "kood")
koos %>% head()

## # A tibble: 6 x 30
##   kood  korpus tekstikeel tekstityyp elukoht taust vanus sugu  emakeel
##   <chr> <chr>  <chr>      <chr>      <chr>   <chr> <chr> <chr> <chr>  
## 1 doc_~ cFOoR~ eesti      essee      idaviru op    kuni~ naine vene   
## 2 doc_~ cFOoR~ eesti      muu        idaviru op    kuni~ naine vene   
## 3 doc_~ cFOoR~ eesti      essee      idaviru op    kuni~ naine vene   
## 4 doc_~ cFOoR~ eesti      muu        tallinn ylop  kuni~ naine vene   
## 5 doc_~ cFOoR~ eesti      muu        tallinn ylop  kuni~ naine vene   
## 6 doc_~ cFOoR~ eesti      muu        tallinn ylop  kuni~ naine vene   
## # ... with 21 more variables: kodukeel <chr>, keeletase <chr>,
## #   haridus <chr>, abivahendid <chr>, A <int>, C <int>, D <int>, G <int>,
## #   H <int>, I <int>, J <int>, K <int>, N <int>, P <int>, S <int>,
## #   U <int>, V <int>, X <int>, Y <int>, Z <int>, kokku <int>

  # Raw counts from column S, split by the value of `sugu`, then compared with
  # a two-sample (Welch) t-test.
  meestenimisonad <- koos %>%
    filter(sugu == "mees") %>%
    pull(S)
  naistenimisonad <- koos %>%
    filter(sugu == "naine") %>%
    pull(S)
  t.test(x = meestenimisonad, y = naistenimisonad)

## 
##  Welch Two Sample t-test
## 
## data:  meestenimisonad and naistenimisonad
## t = -8.8883, df = 2706.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -44.03450 -28.11719
## sample estimates:
## mean of x mean of y 
##  53.23359  89.30943

  # Box plot of the total count `kokku` per `sugu` group, viewed on 0-1000.
  # coord_cartesian() only zooms the axis. The earlier ylim(0, 1000) turned
  # out-of-range rows into NA before the box statistics were computed
  # (76 rows in the recorded run), so the quartiles described truncated data.
  ggplot(koos, aes(sugu, kokku)) +
    geom_boxplot() +
    coord_cartesian(ylim = c(0, 1000))

 # Histogram of `kokku`, bars filled by `sugu`, with the x scale limited
 # to 0-1000 (default binning).
 ggplot(data = koos, mapping = aes(x = kokku, fill = sugu)) +
   geom_histogram() +
   xlim(0, 1000)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 76 rows containing non-finite values (stat_bin).

 # Histogram of the ratio S / kokku, drawn in a separate panel per `sugu` value.
 ggplot(data = koos, mapping = aes(x = S / kokku)) +
   geom_histogram() +
   facet_wrap(~sugu)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 4 rows containing non-finite values (stat_bin).

  # Reduce the table to `sugu` plus the ratio S / kokku, then compare the ratio
  # between the two `sugu` groups with a two-sample (Welch) t-test.
  koos2 <- koos %>%
    transmute(sugu, nimisonaosakaal = S / kokku)
  t.test(
    x = koos2[koos2$sugu == "mees", ]$nimisonaosakaal,
    y = koos2[koos2$sugu == "naine", ]$nimisonaosakaal
  )

## 
##  Welch Two Sample t-test
## 
## data:  koos2[koos2$sugu == "mees", ]$nimisonaosakaal and koos2[koos2$sugu == "naine", ]$nimisonaosakaal
## t = -8.2892, df = 1335.8, p-value = 2.748e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02765178 -0.01706819
## sample estimates:
## mean of x mean of y 
## 0.2331012 0.2554612

# One-sample test: is the mean ratio in the "mees" group different from 0.25?
t.test(
  x = koos2[koos2$sugu == "mees", ]$nimisonaosakaal,
  mu = 0.25
)

## 
##  One Sample t-test
## 
## data:  koos2[koos2$sugu == "mees", ]$nimisonaosakaal
## t = -7.2194, df = 789, p-value = 1.229e-12
## alternative hypothesis: true mean is not equal to 0.25
## 95 percent confidence interval:
##  0.2285064 0.2376960
## sample estimates:
## mean of x 
## 0.2331012

# One-sample test for the "naine" group against 0.25, reporting a 99 %
# confidence interval instead of the default 95 %.
t.test(
  x = koos2[koos2$sugu == "naine", ]$nimisonaosakaal,
  mu = 0.25,
  conf.level = 0.99
)

## 
##  One Sample t-test
## 
## data:  koos2[koos2$sugu == "naine", ]$nimisonaosakaal
## t = 4.0734, df = 2033, p-value = 4.81e-05
## alternative hypothesis: true mean is not equal to 0.25
## 99 percent confidence interval:
##  0.2520046 0.2589178
## sample estimates:
## mean of x 
## 0.2554612

 # Box plot of the ratio S / kokku for each value of `keeletase`.
 ggplot(data = koos, mapping = aes(x = keeletase, y = S / kokku)) +
   geom_boxplot()

## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

# Rebuild koos2 as `keeletase` plus the ratio S / kokku, then fit a one-way
# ANOVA of the ratio on `keeletase`.
koos2 <- koos %>%
  transmute(keeletase, nimisonaosakaal = S / kokku)
aov(formula = koos2$nimisonaosakaal ~ koos2$keeletase)

## Call:
##    aov(formula = koos2$nimisonaosakaal ~ koos2$keeletase)
## 
## Terms:
##                 koos2$keeletase Residuals
## Sum of Squares         0.558411 10.573804
## Deg. of Freedom               8      2815
## 
## Residual standard error: 0.06128814
## Estimated effects may be unbalanced
## 4 observations deleted due to missingness

# ANOVA table (F statistic and p-value) for the same model.
aov(formula = koos2$nimisonaosakaal ~ koos2$keeletase) %>%
  summary()

##                   Df Sum Sq Mean Sq F value Pr(>F)    
## koos2$keeletase    8  0.558 0.06980   18.58 <2e-16 ***
## Residuals       2815 10.574 0.00376                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 4 observations deleted due to missingness

# Tukey HSD: pairwise differences between all `keeletase` groups with
# family-wise adjusted p-values.
TukeyHSD(
  x = aov(formula = koos2$nimisonaosakaal ~ koos2$keeletase)
)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = koos2$nimisonaosakaal ~ koos2$keeletase)
## 
## $`koos2$keeletase`
##                diff          lwr           upr     p adj
## A1-A   0.0997530619 -0.090585642  0.2900917662 0.7901502
## A2-A   0.0272170012 -0.001147260  0.0555812627 0.0717962
## B-A    0.0191631008  0.010881574  0.0274446278 0.0000000
## B1-A  -0.0047398816 -0.019908381  0.0104286175 0.9884950
## B2-A  -0.0037594995 -0.025330859  0.0178118605 0.9998209
## C-A    0.0390842856  0.027380794  0.0507877770 0.0000000
## C1-A   0.0058461926 -0.024090559  0.0357829442 0.9995808
## C2-A  -0.0195671021 -0.209905806  0.1707716023 0.9999969
## A2-A1 -0.0725360606 -0.264797543  0.1197254218 0.9624843
## B-A1  -0.0805899611 -0.270927897  0.1097479752 0.9273383
## B1-A1 -0.1044929435 -0.295254653  0.0862687656 0.7465281
## B2-A1 -0.1035125613 -0.294889858  0.0878647358 0.7595775
## C-A1  -0.0606687762 -0.251186277  0.1298487243 0.9869842
## C1-A1 -0.0939068693 -0.286406624  0.0985928850 0.8491500
## C2-A1 -0.1193201640 -0.388371774  0.1497314457 0.9067389
## B-A2  -0.0080539005 -0.036413007  0.0203052062 0.9939236
## B1-A2 -0.0319568828 -0.063033218 -0.0008805477 0.0383189
## B2-A2 -0.0309765007 -0.065631642  0.0036786405 0.1233301
## C-A2   0.0118672844 -0.017672975  0.0414075442 0.9458705
## C1-A2 -0.0213708087 -0.061767149  0.0190255320 0.7813449
## C2-A2 -0.0467841033 -0.239045586  0.1454773791 0.9979257
## B1-B  -0.0239029824 -0.039061840 -0.0087441247 0.0000365
## B2-B  -0.0229226002 -0.044487182 -0.0013580188 0.0273001
## C-B    0.0199211848  0.008230192  0.0316121776 0.0000047
## C1-B  -0.0133169082 -0.043248776  0.0166149594 0.9051502
## C2-B  -0.0387302029 -0.229068139  0.1516077334 0.9994319
## B2-B1  0.0009803821 -0.024050216  0.0260109804 1.0000000
## C-B1   0.0438241672  0.026556307  0.0610920272 0.0000000
## C1-B1  0.0105860741 -0.021931865  0.0431040134 0.9849859
## C2-B1 -0.0148272205 -0.205588930  0.1759344886 0.9999997
## C-B2   0.0428437851  0.019747915  0.0659396547 0.0000003
## C1-B2  0.0096056920 -0.026347842  0.0455592257 0.9960039
## C2-B2 -0.0158076027 -0.207184900  0.1755696945 0.9999995
## C1-C  -0.0332380931 -0.064291349 -0.0021848368 0.0253479
## C2-C  -0.0586513877 -0.249168888  0.1318661128 0.9895706
## C2-C1 -0.0254132947 -0.217913049  0.1670864596 0.9999785

# Repeat the analysis on four `keeletase` values only: keep those rows,
# reduce to `keeletase` plus the ratio S / kokku, and fit the one-way ANOVA.
koos2 <- koos %>%
  filter(keeletase %in% c("A2", "B1", "B2", "C1")) %>%
  transmute(keeletase, nimisonaosakaal = S / kokku)
aov(formula = koos2$nimisonaosakaal ~ koos2$keeletase)

## Call:
##    aov(formula = koos2$nimisonaosakaal ~ koos2$keeletase)
## 
## Terms:
##                 koos2$keeletase Residuals
## Sum of Squares         0.041287  1.146924
## Deg. of Freedom               3       354
## 
## Residual standard error: 0.0569201
## Estimated effects may be unbalanced
## 1 observation deleted due to missingness

# ANOVA table for the model fitted on the current koos2.
aov(formula = koos2$nimisonaosakaal ~ koos2$keeletase) %>%
  summary()

##                  Df Sum Sq Mean Sq F value  Pr(>F)   
## koos2$keeletase   3 0.0413 0.01376   4.248 0.00575 **
## Residuals       354 1.1469 0.00324                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness

# Tukey HSD pairwise comparisons between the `keeletase` groups present in
# the current koos2.
TukeyHSD(
  x = aov(formula = koos2$nimisonaosakaal ~ koos2$keeletase)
)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = koos2$nimisonaosakaal ~ koos2$keeletase)
## 
## $`koos2$keeletase`
##                diff         lwr          upr     p adj
## B1-A2 -0.0319568828 -0.05595677 -0.007956995 0.0036624
## B2-A2 -0.0309765007 -0.05774026 -0.004212743 0.0158480
## C1-A2 -0.0213708087 -0.05256843  0.009826810 0.2904491
## B2-B1  0.0009803821 -0.01835045  0.020311219 0.9991977
## C1-B1  0.0105860741 -0.01452715  0.035699296 0.6970545
## C1-B2  0.0096056920 -0.01816080  0.037372183 0.8085111

arvuvordlused

arvukogumite võrdlemine

http://www.tlu.ee/~jaagup/andmed/keel/korpus/dokmeta.txt

http://www.tlu.ee/~jaagup/andmed/keel/korpus/doksonaliigid.txt