library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
# Read the apple measurements into `ounad`.
ounad <- read_csv(file = "http://www.tlu.ee/~jaagup/andmed/muu/ounad/antoonovka2.txt")
## Parsed with column specification:
## cols(
##   august = col_double(),
##   september = col_double()
## )
# Scatter plot of the September value against the August value.
ggplot(ounad, aes(x = august, y = september)) +
  geom_point()

# Scatter plot with a least-squares line, both axes shown from 0 to 10.
# coord_cartesian() only zooms the view. xlim()/ylim() set scale limits
# instead, which replace out-of-range observations with NA before
# geom_smooth() fits the line.
ggplot(ounad, aes(x = august, y = september)) +
  geom_point() +
  geom_smooth(method = "lm") +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 10))

# Fit september ~ august and print the coefficients.
print(lm(formula = september ~ august, data = ounad))
## 
## Call:
## lm(formula = september ~ august, data = ounad)
## 
## Coefficients:
## (Intercept)       august  
##        1.85         1.01
# Full regression summary for the same september ~ august fit.
print(summary(lm(formula = september ~ august, data = ounad)))
## 
## Call:
## lm(formula = september ~ august, data = ounad)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.90315 -0.39738 -0.09633  0.30839  1.20315 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.84966    0.25183   7.345 6.16e-11 ***
## august       1.01049    0.05145  19.639  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4923 on 98 degrees of freedom
## Multiple R-squared:  0.7974, Adjusted R-squared:  0.7953 
## F-statistic: 385.7 on 1 and 98 DF,  p-value: < 2.2e-16
# Predicted September values for August values 2, 3 and 4.
predict(
  object = lm(formula = september ~ august, data = ounad),
  newdata = tibble(august = c(2, 3, 4))
)
##        1        2        3 
## 3.870636 4.881123 5.891610
# Task: build a tibble of new August apple diameters,
# compute the predicted September diameters alongside them,
# and show those apples on an xy plot.

# NOTE(review): rnorm() is unseeded and unbounded, so the values differ
# between runs and can be negative.
uuritavad <- tibble(august = rnorm(n = 10, mean = 4, sd = 2))
print(uuritavad)
## # A tibble: 10 x 1
##       august
##        <dbl>
##  1  2.065848
##  2  3.960080
##  3 -0.280374
##  4  4.170930
##  5  2.622794
##  6  4.488023
##  7  5.601334
##  8  4.232374
##  9  3.817080
## 10  1.775218
# Fit september ~ august on `ounad`, then store the model's September
# prediction as a new column of `uuritavad` and show the result.
mudel <- lm(formula = september ~ august, data = ounad)
uuritavad$september <- predict(object = mudel, newdata = uuritavad)
print(uuritavad)
## # A tibble: 10 x 2
##       august september
##        <dbl>     <dbl>
##  1  2.065848  3.937174
##  2  3.960080  5.851272
##  3 -0.280374  1.566347
##  4  4.170930  6.064333
##  5  2.622794  4.499961
##  6  4.488023  6.384751
##  7  5.601334  7.509738
##  8  4.232374  6.126422
##  9  3.817080  5.706772
## 10  1.775218  3.643496
# Scatter plot of the predicted values with a centred title.
ggplot(uuritavad, aes(x = august, y = september)) +
  geom_point() +
  labs(title = "Ennustus", x = "augusti diameeter") +
  theme(plot.title = element_text(hjust = 0.5))

# Replace `ounad` with the file that also has an `oktoober` column.
ounad <- read_csv(file = "http://www.tlu.ee/~jaagup/andmed/muu/ounad/antoonovka3.txt")
## Parsed with column specification:
## cols(
##   august = col_double(),
##   september = col_double(),
##   oktoober = col_double()
## )
# Task: draw a plot with the apple diameter growth from August to September
# on one axis and the growth from September to October on the other.

# Growth against growth as a plain scatter plot.
ggplot(ounad, aes(x = september - august, y = oktoober - september)) +
  geom_point()

# The same scatter plot with geom_smooth()'s default smoother on top.
ggplot(ounad, aes(x = september - august, y = oktoober - september)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

# Column means of the three measurements and the two month-to-month differences.
colMeans(
  mutate(ounad, vahe1 = september - august, vahe2 = oktoober - september)
)
##    august september  oktoober     vahe1     vahe2 
##     4.800     6.700     7.742     1.900     1.042
# Keep only the two month-to-month differences in `vahed` and show it.
vahed <- transmute(
  ounad,
  vahe1 = september - august,
  vahe2 = oktoober - september
)
print(vahed)
## # A tibble: 100 x 2
##    vahe1 vahe2
##    <dbl> <dbl>
##  1   1.9   0.8
##  2   1.7   0.8
##  3   1.4   0.3
##  4   1.3   1.4
##  5   2.2   1.3
##  6   1.7   1.1
##  7   1.6   1.9
##  8   2.3   1.3
##  9   2.2   1.2
## 10   2.5   1.2
## # ... with 90 more rows
# Regress the second difference on the first and print the coefficients.
print(lm(formula = vahe2 ~ vahe1, data = vahed))
## 
## Call:
## lm(formula = vahe2 ~ vahe1, data = vahed)
## 
## Coefficients:
## (Intercept)        vahe1  
##       1.319       -0.146
# Full regression summary for vahe2 ~ vahe1.
print(summary(lm(formula = vahe2 ~ vahe1, data = vahed)))
## 
## Call:
## lm(formula = vahe2 ~ vahe1, data = vahed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.95660 -0.37851  0.02768  0.31642  1.57484 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.3195     0.2053   6.428  4.7e-09 ***
## vahe1        -0.1460     0.1046  -1.396    0.166    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5101 on 98 degrees of freedom
## Multiple R-squared:  0.01949,    Adjusted R-squared:  0.009483 
## F-statistic: 1.948 on 1 and 98 DF,  p-value: 0.166
# Show the three-column table.
print(ounad)
## # A tibble: 100 x 3
##    august september oktoober
##     <dbl>     <dbl>    <dbl>
##  1    6.0       7.9      8.7
##  2    4.0       5.7      6.5
##  3    5.2       6.6      6.9
##  4    4.1       5.4      6.8
##  5    5.7       7.9      9.2
##  6    4.2       5.9      7.0
##  7    7.0       8.6     10.5
##  8    6.8       9.1     10.4
##  9    4.1       6.3      7.5
## 10    5.5       8.0      9.2
## # ... with 90 more rows
# Regress oktoober on both august and september and print the coefficients.
print(lm(formula = oktoober ~ august + september, data = ounad))
## 
## Call:
## lm(formula = oktoober ~ august + september, data = ounad)
## 
## Coefficients:
## (Intercept)       august    september  
##      0.8322       0.2534       0.8498
# Full regression summary for oktoober ~ august + september.
print(summary(lm(formula = oktoober ~ august + september, data = ounad)))
## 
## Call:
## lm(formula = oktoober ~ august + september, data = ounad)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.01892 -0.39587  0.05117  0.30725  1.54722 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.8322     0.3203   2.598   0.0108 *  
## august        0.2534     0.1168   2.170   0.0324 *  
## september     0.8498     0.1032   8.236 8.42e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5028 on 97 degrees of freedom
## Multiple R-squared:  0.8417, Adjusted R-squared:  0.8384 
## F-statistic: 257.8 on 2 and 97 DF,  p-value: < 2.2e-16
# Simulate 10 August values, then derive September as August plus a uniform
# random increment between 0.5 and 1.
uuritavad <- tibble(august = rnorm(n = 10, mean = 4, sd = 1.5))
uuritavad$september <- runif(n = 10, min = 0.5, max = 1) + uuritavad$august
print(uuritavad)
## # A tibble: 10 x 2
##      august september
##       <dbl>     <dbl>
##  1 3.769017  4.665684
##  2 4.541267  5.123095
##  3 4.261481  4.880260
##  4 8.782933  9.558034
##  5 3.876861  4.829180
##  6 4.121002  4.994985
##  7 2.759191  3.265101
##  8 5.580773  6.234512
##  9 3.579306  4.112640
## 10 3.602178  4.288434
# Fit oktoober ~ august + september on `ounad`, then store the model's
# October prediction as a new column of `uuritavad` and show the result.
mudel <- lm(formula = oktoober ~ august + september, data = ounad)
uuritavad$oktoober <- predict(object = mudel, newdata = uuritavad)
print(uuritavad)
## # A tibble: 10 x 3
##      august september  oktoober
##       <dbl>     <dbl>     <dbl>
##  1 3.769017  4.665684  5.752030
##  2 4.541267  5.123095  6.336408
##  3 4.261481  4.880260  6.059156
##  4 8.782933  9.558034 11.179928
##  5 3.876861  4.829180  5.918292
##  6 4.121002  4.994985  6.121053
##  7 2.759191  3.265101  4.305960
##  8 5.580773  6.234512  7.544268
##  9 3.579306  4.112640  5.233991
## 10 3.602178  4.288434  5.389174
# Replace `ounad` with the file that carries an `ounasort` column.
ounad <- read_csv(file = "http://www.tlu.ee/~jaagup/andmed/muu/ounad/liivi_antoonovka_aug_sept_1000.txt")
## Parsed with column specification:
## cols(
##   ounasort = col_character(),
##   august = col_double(),
##   september = col_double()
## )
# Task: show the apples on an XY plot, coloured by variety.

ggplot(ounad, aes(x = august, y = september, colour = ounasort)) +
  geom_point()

# Box plot of the August values for the "Antoonovka" rows only.
ggplot(filter(ounad, ounasort == "Antoonovka"), aes(x = ounasort, y = august)) +
  geom_boxplot()

# Per-document counts and percentages, keyed by `kood`.
arvud <- read_csv(file = "http://www.tlu.ee/~jaagup/andmed/keel/korpus/dokarvud.txt")
## Parsed with column specification:
## cols(
##   kood = col_character(),
##   tahti = col_integer(),
##   sonu = col_integer(),
##   lauseid = col_integer(),
##   vigu = col_integer(),
##   veatyype = col_integer(),
##   kolmetahelistepr = col_double(),
##   viietahelistepr = col_double(),
##   kymnejarohkemtahelistepr = col_double(),
##   kahesonalistepr = col_double(),
##   kolmesonalistepr = col_double(),
##   kuuekuni9sonalistepr = col_double(),
##   kymnekuni20sonalistepr = col_double()
## )
# Per-document metadata, keyed by `kood`.
dokmeta <- read_csv(file = "http://www.tlu.ee/~jaagup/andmed/keel/korpus/dokmeta.txt")
## Parsed with column specification:
## cols(
##   kood = col_character(),
##   korpus = col_character(),
##   tekstikeel = col_character(),
##   tekstityyp = col_character(),
##   elukoht = col_character(),
##   taust = col_character(),
##   vanus = col_character(),
##   sugu = col_character(),
##   emakeel = col_character(),
##   kodukeel = col_character(),
##   keeletase = col_character(),
##   haridus = col_character(),
##   abivahendid = col_character()
## )
# Task: find the total number of letters in texts by authors aged up to 18,
# and draw a box plot of the distribution of the texts' letter counts.

# Inspect the joined table. The key is named explicitly so the join does not
# depend on which column names the two tables happen to share (and no
# "Joining, by" message is emitted).
str(inner_join(arvud, dokmeta, by = "kood"))
## Classes 'tbl_df', 'tbl' and 'data.frame':    12724 obs. of  25 variables:
##  $ kood                    : chr  "doc_100636852915_item" "doc_100636852916_item" "doc_100636852917_item" "doc_1010138197_item" ...
##  $ tahti                   : int  1156 661 936 4146 4145 4215 4058 46245 0 2301 ...
##  $ sonu                    : int  181 101 128 559 565 570 550 6181 0 414 ...
##  $ lauseid                 : int  14 6 12 83 80 84 83 536 0 43 ...
##  $ vigu                    : int  39 19 22 49 62 44 52 0 0 0 ...
##  $ veatyype                : int  22 11 15 15 26 16 21 0 0 0 ...
##  $ kolmetahelistepr        : num  12.71 10 13.74 6.08 5.66 ...
##  $ viietahelistepr         : num  14.9 13 11.4 13.1 13.6 ...
##  $ kymnejarohkemtahelistepr: num  4.98 8 12.97 10.2 9.73 ...
##  $ kahesonalistepr         : num  0 0 0 0 0 0 0 9.09 0 0 ...
##  $ kolmesonalistepr        : num  0 0 9.09 1.25 1.33 1.25 1.27 3.95 0 0 ...
##  $ kuuekuni9sonalistepr    : num  37.5 20 36.4 32.5 36 ...
##  $ kymnekuni20sonalistepr  : num  75 100 45.5 16.2 16 ...
##  $ korpus                  : chr  "cFOoRQekA" "cFOoRQekA" "cFOoRQekA" "cFOoRQekA" ...
##  $ tekstikeel              : chr  "eesti" "eesti" "eesti" "eesti" ...
##  $ tekstityyp              : chr  "essee" "muu" "essee" "muu" ...
##  $ elukoht                 : chr  "idaviru" "idaviru" "idaviru" "tallinn" ...
##  $ taust                   : chr  "op" "op" "op" "ylop" ...
##  $ vanus                   : chr  "kuni18" "kuni18" "kuni18" "kuni26" ...
##  $ sugu                    : chr  "naine" "naine" "naine" "naine" ...
##  $ emakeel                 : chr  "vene" "vene" "vene" "vene" ...
##  $ kodukeel                : chr  "vene" "vene" "vene" "vene" ...
##  $ keeletase               : chr  "B" "B" "B" "A" ...
##  $ haridus                 : chr  "pohi" "pohi" "pohi" "kesk" ...
##  $ abivahendid             : chr  "ei" "ei" "ei" "ei" ...
# Total letter count over the texts whose `vanus` is "kuni18".
# The join key is explicit so the result cannot change if the two tables
# ever share another column name.
arvud %>%
  inner_join(dokmeta, by = "kood") %>%
  filter(vanus == "kuni18") %>%
  .$tahti %>%
  sum()
## [1] 1678825
# Box plot of letter counts per `vanus` value; the join key is explicit.
arvud %>%
  inner_join(dokmeta, by = "kood") %>%
  ggplot(aes(vanus, tahti)) +
  geom_boxplot()

# Box plot of letter counts for "kuni18" texts with at least one letter.
# The join key is explicit, and both row conditions sit in a single filter().
arvud %>%
  inner_join(dokmeta, by = "kood") %>%
  filter(vanus == "kuni18", tahti > 0) %>%
  ggplot(aes(vanus, tahti)) +
  geom_boxplot()

# Smallest positive letter count among the "kuni18" texts.
# The join key is explicit, and both row conditions sit in a single filter().
arvud %>%
  inner_join(dokmeta, by = "kood") %>%
  filter(vanus == "kuni18", tahti > 0) %>%
  .$tahti %>%
  min()
## [1] 4
# Task: try to find a relationship, or the absence of one, between the number
# of letters in a text and the percentage of words longer than ten letters.

# Task: use predict() to estimate the percentage of words longer than ten
# letters for 20 randomly chosen texts, and show the actual values alongside.

ggplot(arvud, aes(x = tahti, y = kymnejarohkemtahelistepr)) +
  geom_point()

# Overall mean of the long-word percentage column.
print(mean(arvud[["kymnejarohkemtahelistepr"]]))
## [1] 8.641552
# Same scatter plot with a horizontal line at the overall mean percentage.
ggplot(arvud, aes(x = tahti, y = kymnejarohkemtahelistepr)) +
  geom_point() +
  geom_hline(yintercept = mean(arvud$kymnejarohkemtahelistepr))

# Regress the long-word percentage on the letter count; print coefficients.
print(lm(formula = kymnejarohkemtahelistepr ~ tahti, data = arvud))
## 
## Call:
## lm(formula = kymnejarohkemtahelistepr ~ tahti, data = arvud)
## 
## Coefficients:
## (Intercept)        tahti  
##   7.3153583    0.0007273
# Full regression summary for kymnejarohkemtahelistepr ~ tahti.
print(summary(lm(formula = kymnejarohkemtahelistepr ~ tahti, data = arvud)))
## 
## Call:
## lm(formula = kymnejarohkemtahelistepr ~ tahti, data = arvud)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.596  -2.809  -0.586   2.275  92.677 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.315e+00  5.202e-02  140.63   <2e-16 ***
## tahti       7.273e-04  1.781e-05   40.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.583 on 12722 degrees of freedom
## Multiple R-squared:  0.1158, Adjusted R-squared:  0.1158 
## F-statistic:  1667 on 1 and 12722 DF,  p-value: < 2.2e-16
# Draw 20 letter counts at random (the draw is only printed, not stored).
print(sample(x = arvud$tahti, size = 20))
##  [1] 1723 3344  648 1932  675 2117 2514  849  949 2878  605 1374  538  716
## [15] 2298 1179  645 1241  674 1675
# Draw 20 random rows, predict their long-word percentage from the letter
# count, and keep the letter count, actual and predicted values side by side.
uuritav <- sample_n(arvud, size = 20)

mudel <- lm(formula = kymnejarohkemtahelistepr ~ tahti, data = arvud)
uuritav$uusprotsent <- predict(object = mudel, newdata = uuritav)

tulemus <- select(uuritav, tahti, kymnejarohkemtahelistepr, uusprotsent)
print(tulemus)
## # A tibble: 20 x 3
##    tahti kymnejarohkemtahelistepr uusprotsent
##    <int>                    <dbl>       <dbl>
##  1  1621                     3.44    8.494272
##  2   707                     9.58    7.829542
##  3   597                     7.06    7.749542
##  4   623                    11.90    7.768451
##  5   620                     2.28    7.766269
##  6   750                     9.99    7.860815
##  7   820                     6.90    7.911724
##  8  1156                     7.43    8.156089
##  9  1498                     6.46    8.404817
## 10   609                     3.33    7.758269
## 11   746                     6.19    7.857906
## 12   613                     8.87    7.761178
## 13  2930                    10.73    9.446275
## 14   799                    10.09    7.896451
## 15   467                     5.40    7.654996
## 16  6583                    12.07   12.103012
## 17   569                     5.07    7.729178
## 18  2596                    12.53    9.203365
## 19  2553                    13.22    9.172092
## 20   738                     4.55    7.852087
# Per row: absolute distance of the actual value from the model prediction
# and from the overall mean of the column.
mutate(
  tulemus,
  vaheuuega = abs(kymnejarohkemtahelistepr - uusprotsent),
  vahekeskmisega = abs(
    kymnejarohkemtahelistepr - mean(arvud$kymnejarohkemtahelistepr)
  )
)
## # A tibble: 20 x 5
##    tahti kymnejarohkemtahelistepr uusprotsent  vaheuuega vahekeskmisega
##    <int>                    <dbl>       <dbl>      <dbl>          <dbl>
##  1  1621                     3.44    8.494272 5.05427156      5.2015522
##  2   707                     9.58    7.829542 1.75045805      0.9384478
##  3   597                     7.06    7.749542 0.68954167      1.5815522
##  4   623                    11.90    7.768451 4.13154918      3.2584478
##  5   620                     2.28    7.766269 5.48626900      6.3615522
##  6   750                     9.99    7.860815 2.12918522      1.3484478
##  7   820                     6.90    7.911724 1.01172406      1.7415522
##  8  1156                     7.43    8.156089 0.72608855      1.2115522
##  9  1498                     6.46    8.404817 1.94481670      2.1815522
## 10   609                     3.33    7.758269 4.42826897      5.3115522
## 11   746                     6.19    7.857906 1.66790568      2.4515522
## 12   613                     8.87    7.761178 1.10882193      0.2284478
## 13  2930                    10.73    9.446275 1.28372508      2.0884478
## 14   799                    10.09    7.896451 2.19354873      1.4484478
## 15   467                     5.40    7.654996 2.25499588      3.2415522
## 16  6583                    12.07   12.103012 0.03301157      3.4284478
## 17   569                     5.07    7.729178 2.65917796      3.5715522
## 18  2596                    12.53    9.203365 3.32663503      3.8884478
## 19  2553                    13.22    9.172092 4.04790786      4.5784478
## 20   738                     4.55    7.852087 3.30208748      4.0915522
# Sum the two absolute-distance columns to compare the model prediction
# with the overall mean over the sampled rows.
summarise(
  mutate(
    tulemus,
    vaheuuega = abs(kymnejarohkemtahelistepr - uusprotsent),
    vahekeskmisega = abs(
      kymnejarohkemtahelistepr - mean(arvud$kymnejarohkemtahelistepr)
    )
  ),
  uuesumma = sum(vaheuuega),
  keskmisesumma = sum(vahekeskmisega)
)
## # A tibble: 1 x 2
##   uuesumma keskmisesumma
##      <dbl>         <dbl>
## 1 49.22999       58.1531