library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# Load the apple diameter measurements; the file parses into two numeric
# columns, `august` and `september`.
ounad <- read_csv("http://www.tlu.ee/~jaagup/andmed/muu/ounad/antoonovka2.txt")
## Parsed with column specification:
## cols(
## august = col_double(),
## september = col_double()
## )
# Scatter plot of the September diameter against the August diameter.
hajuvusjoonis <- ggplot(ounad, aes(august, september)) +
  geom_point()
hajuvusjoonis
# The same scatter plot with both axes fixed to 0..10 and a linear fit added.
hajuvusjoonis +
  xlim(0, 10) +
  ylim(0, 10) +
  geom_smooth(method = "lm")
# Fit the simple linear regression once; the printout, the summary and the
# prediction below all use this same model.
mudel <- lm(september ~ august, data = ounad)
mudel
##
## Call:
## lm(formula = september ~ august, data = ounad)
##
## Coefficients:
## (Intercept) august
## 1.85 1.01
summary(mudel)
##
## Call:
## lm(formula = september ~ august, data = ounad)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.90315 -0.39738 -0.09633 0.30839 1.20315
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.84966 0.25183 7.345 6.16e-11 ***
## august 1.01049 0.05145 19.639 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4923 on 98 degrees of freedom
## Multiple R-squared: 0.7974, Adjusted R-squared: 0.7953
## F-statistic: 385.7 on 1 and 98 DF, p-value: < 2.2e-16
# Predicted September diameters for August diameters of 2, 3 and 4.
predict(mudel, newdata = tibble(august = c(2, 3, 4)))
## 1 2 3
## 3.870636 4.881123 5.891610
# Create a tibble with new apple diameters in August,
# compute the predicted September diameters next to them,
# and show these apples on an xy plot.

# Ten simulated August diameters drawn from a normal distribution
# (mean 4, standard deviation 2).
# NOTE(review): no seed is set, so the values differ from run to run, and a
# draw can be negative (row 3 of the recorded output below).
uuritavad <- tibble(august = rnorm(n = 10, mean = 4, sd = 2))
uuritavad
## # A tibble: 10 x 1
## august
## <dbl>
## 1 2.065848
## 2 3.960080
## 3 -0.280374
## 4 4.170930
## 5 2.622794
## 6 4.488023
## 7 5.601334
## 8 4.232374
## 9 3.817080
## 10 1.775218
# Regress September on August using the measured apples, then store the
# model's predictions as a new `september` column of the simulated apples.
mudel <- lm(september ~ august, data = ounad)
uuritavad[["september"]] <- predict(mudel, newdata = uuritavad)
uuritavad
## # A tibble: 10 x 2
## august september
## <dbl> <dbl>
## 1 2.065848 3.937174
## 2 3.960080 5.851272
## 3 -0.280374 1.566347
## 4 4.170930 6.064333
## 5 2.622794 4.499961
## 6 4.488023 6.384751
## 7 5.601334 7.509738
## 8 4.232374 6.126422
## 9 3.817080 5.706772
## 10 1.775218 3.643496
# Plot the simulated apples with their predicted values; the title is centred.
ggplot(uuritavad, aes(august, september)) +
  geom_point() +
  labs(title = "Ennustus", x = "augusti diameeter") +
  theme(plot.title = element_text(hjust = 0.5))
# Load the three-month measurements; the file parses into numeric columns
# `august`, `september` and `oktoober`.
ounad <- read_csv("http://www.tlu.ee/~jaagup/andmed/muu/ounad/antoonovka3.txt")
## Parsed with column specification:
## cols(
## august = col_double(),
## september = col_double(),
## oktoober = col_double()
## )
# Create a plot where one axis shows the growth of the apple diameter from
# August to September and the other axis the growth from September to October.
kasvujoonis <- ggplot(ounad, aes(september - august, oktoober - september)) +
  geom_point()
kasvujoonis
# The same plot with the default smoother added.
kasvujoonis + geom_smooth()
## `geom_smooth()` using method = 'loess'
# Add both growth increments as columns and print every column's mean.
kasvudega <- mutate(
  ounad,
  vahe1 = september - august,
  vahe2 = oktoober - september
)
colMeans(kasvudega)
## august september oktoober vahe1 vahe2
## 4.800 6.700 7.742 1.900 1.042
# Keep only the two growth columns for the regression below.
vahed <- select(kasvudega, vahe1, vahe2)
vahed
## # A tibble: 100 x 2
## vahe1 vahe2
## <dbl> <dbl>
## 1 1.9 0.8
## 2 1.7 0.8
## 3 1.4 0.3
## 4 1.3 1.4
## 5 2.2 1.3
## 6 1.7 1.1
## 7 1.6 1.9
## 8 2.3 1.3
## 9 2.2 1.2
## 10 2.5 1.2
## # ... with 90 more rows
# Regress the second growth increment on the first; the model is fitted once
# and used for both the short printout and the full summary.
vahede_mudel <- lm(vahe2 ~ vahe1, data = vahed)
vahede_mudel
##
## Call:
## lm(formula = vahe2 ~ vahe1, data = vahed)
##
## Coefficients:
## (Intercept) vahe1
## 1.319 -0.146
summary(vahede_mudel)
##
## Call:
## lm(formula = vahe2 ~ vahe1, data = vahed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.95660 -0.37851 0.02768 0.31642 1.57484
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3195 0.2053 6.428 4.7e-09 ***
## vahe1 -0.1460 0.1046 -1.396 0.166
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5101 on 98 degrees of freedom
## Multiple R-squared: 0.01949, Adjusted R-squared: 0.009483
## F-statistic: 1.948 on 1 and 98 DF, p-value: 0.166
# Print the three-month table that the models below are fitted on.
ounad
## # A tibble: 100 x 3
## august september oktoober
## <dbl> <dbl> <dbl>
## 1 6.0 7.9 8.7
## 2 4.0 5.7 6.5
## 3 5.2 6.6 6.9
## 4 4.1 5.4 6.8
## 5 5.7 7.9 9.2
## 6 4.2 5.9 7.0
## 7 7.0 8.6 10.5
## 8 6.8 9.1 10.4
## 9 4.1 6.3 7.5
## 10 5.5 8.0 9.2
## # ... with 90 more rows
# Regress the October diameter on the August and September diameters. The
# model is fitted once and reused for the printout, the summary and the
# prediction further down.
mudel <- lm(oktoober ~ august + september, data = ounad)
mudel
##
## Call:
## lm(formula = oktoober ~ august + september, data = ounad)
##
## Coefficients:
## (Intercept) august september
## 0.8322 0.2534 0.8498
summary(mudel)
##
## Call:
## lm(formula = oktoober ~ august + september, data = ounad)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01892 -0.39587 0.05117 0.30725 1.54722
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8322 0.3203 2.598 0.0108 *
## august 0.2534 0.1168 2.170 0.0324 *
## september 0.8498 0.1032 8.236 8.42e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5028 on 97 degrees of freedom
## Multiple R-squared: 0.8417, Adjusted R-squared: 0.8384
## F-statistic: 257.8 on 2 and 97 DF, p-value: < 2.2e-16
# Simulate ten apples: August diameters from a normal distribution (mean 4,
# sd 1.5), September diameters as August plus a uniform increment in [0.5, 1].
# The normal draw happens before the uniform draw.
uuritavad <- tibble(august = rnorm(n = 10, mean = 4, sd = 1.5))
uuritavad[["september"]] <-
  uuritavad[["august"]] + runif(n = 10, min = 0.5, max = 1)
uuritavad
## # A tibble: 10 x 2
## august september
## <dbl> <dbl>
## 1 3.769017 4.665684
## 2 4.541267 5.123095
## 3 4.261481 4.880260
## 4 8.782933 9.558034
## 5 3.876861 4.829180
## 6 4.121002 4.994985
## 7 2.759191 3.265101
## 8 5.580773 6.234512
## 9 3.579306 4.112640
## 10 3.602178 4.288434
# Store the model's predicted October diameters next to the simulated inputs.
uuritavad[["oktoober"]] <- predict(mudel, newdata = uuritavad)
uuritavad
## # A tibble: 10 x 3
## august september oktoober
## <dbl> <dbl> <dbl>
## 1 3.769017 4.665684 5.752030
## 2 4.541267 5.123095 6.336408
## 3 4.261481 4.880260 6.059156
## 4 8.782933 9.558034 11.179928
## 5 3.876861 4.829180 5.918292
## 6 4.121002 4.994985 6.121053
## 7 2.759191 3.265101 4.305960
## 8 5.580773 6.234512 7.544268
## 9 3.579306 4.112640 5.233991
## 10 3.602178 4.288434 5.389174
# Load the measurements that also carry the apple variety; the file parses
# into a character column `ounasort` and numeric `august` and `september`.
ounad <- read_csv("http://www.tlu.ee/~jaagup/andmed/muu/ounad/liivi_antoonovka_aug_sept_1000.txt")
## Parsed with column specification:
## cols(
## ounasort = col_character(),
## august = col_double(),
## september = col_double()
## )
# Show the apples on an XY plot, coloured by variety.
ggplot(ounad, aes(x = august, y = september, color = ounasort)) +
  geom_point()
# Box plot of the August diameters, restricted to the "Antoonovka" rows.
ggplot(filter(ounad, ounasort == "Antoonovka"), aes(ounasort, august)) +
  geom_boxplot()
# Per-document numeric statistics of the text corpus, keyed by `kood`.
arvud <- read_csv("http://www.tlu.ee/~jaagup/andmed/keel/korpus/dokarvud.txt")
## Parsed with column specification:
## cols(
## kood = col_character(),
## tahti = col_integer(),
## sonu = col_integer(),
## lauseid = col_integer(),
## vigu = col_integer(),
## veatyype = col_integer(),
## kolmetahelistepr = col_double(),
## viietahelistepr = col_double(),
## kymnejarohkemtahelistepr = col_double(),
## kahesonalistepr = col_double(),
## kolmesonalistepr = col_double(),
## kuuekuni9sonalistepr = col_double(),
## kymnekuni20sonalistepr = col_double()
## )
# Per-document metadata (all character columns), keyed by `kood`.
dokmeta <- read_csv("http://www.tlu.ee/~jaagup/andmed/keel/korpus/dokmeta.txt")
## Parsed with column specification:
## cols(
## kood = col_character(),
## korpus = col_character(),
## tekstikeel = col_character(),
## tekstityyp = col_character(),
## elukoht = col_character(),
## taust = col_character(),
## vanus = col_character(),
## sugu = col_character(),
## emakeel = col_character(),
## kodukeel = col_character(),
## keeletase = col_character(),
## haridus = col_character(),
## abivahendid = col_character()
## )
# Find the total number of letters in the texts of authors aged up to 18.
# Draw a box plot of the distribution of the texts' letter counts.

# Join the two tables once and reuse the result instead of repeating the same
# join before every step. The key is spelled out (`kood` is the only column
# the two tables share) so the join cannot silently change if a second common
# column is ever added; with an explicit key dplyr prints no "Joining" message.
tekstid <- inner_join(arvud, dokmeta, by = "kood")
str(tekstid)
## Classes 'tbl_df', 'tbl' and 'data.frame': 12724 obs. of 25 variables:
## $ kood : chr "doc_100636852915_item" "doc_100636852916_item" "doc_100636852917_item" "doc_1010138197_item" ...
## $ tahti : int 1156 661 936 4146 4145 4215 4058 46245 0 2301 ...
## $ sonu : int 181 101 128 559 565 570 550 6181 0 414 ...
## $ lauseid : int 14 6 12 83 80 84 83 536 0 43 ...
## $ vigu : int 39 19 22 49 62 44 52 0 0 0 ...
## $ veatyype : int 22 11 15 15 26 16 21 0 0 0 ...
## $ kolmetahelistepr : num 12.71 10 13.74 6.08 5.66 ...
## $ viietahelistepr : num 14.9 13 11.4 13.1 13.6 ...
## $ kymnejarohkemtahelistepr: num 4.98 8 12.97 10.2 9.73 ...
## $ kahesonalistepr : num 0 0 0 0 0 0 0 9.09 0 0 ...
## $ kolmesonalistepr : num 0 0 9.09 1.25 1.33 1.25 1.27 3.95 0 0 ...
## $ kuuekuni9sonalistepr : num 37.5 20 36.4 32.5 36 ...
## $ kymnekuni20sonalistepr : num 75 100 45.5 16.2 16 ...
## $ korpus : chr "cFOoRQekA" "cFOoRQekA" "cFOoRQekA" "cFOoRQekA" ...
## $ tekstikeel : chr "eesti" "eesti" "eesti" "eesti" ...
## $ tekstityyp : chr "essee" "muu" "essee" "muu" ...
## $ elukoht : chr "idaviru" "idaviru" "idaviru" "tallinn" ...
## $ taust : chr "op" "op" "op" "ylop" ...
## $ vanus : chr "kuni18" "kuni18" "kuni18" "kuni26" ...
## $ sugu : chr "naine" "naine" "naine" "naine" ...
## $ emakeel : chr "vene" "vene" "vene" "vene" ...
## $ kodukeel : chr "vene" "vene" "vene" "vene" ...
## $ keeletase : chr "B" "B" "B" "A" ...
## $ haridus : chr "pohi" "pohi" "pohi" "kesk" ...
## $ abivahendid : chr "ei" "ei" "ei" "ei" ...
# Texts whose author age group is "kuni18", and their total letter count.
noorte_tekstid <- filter(tekstid, vanus == "kuni18")
sum(noorte_tekstid$tahti)
## [1] 1678825
# Letter-count distribution per age group over all joined texts.
ggplot(tekstid, aes(vanus, tahti)) +
  geom_boxplot()
# The same for the "kuni18" group only, leaving out texts with zero letters.
noorte_mittetyhjad <- filter(noorte_tekstid, tahti > 0)
ggplot(noorte_mittetyhjad, aes(vanus, tahti)) +
  geom_boxplot()
# Smallest non-zero letter count in the "kuni18" group.
min(noorte_mittetyhjad$tahti)
## [1] 4
# Try to find a relationship, or the lack of one, between a text's letter
# count and its percentage of words longer than ten letters.
# Using predict, estimate that percentage for 20 randomly chosen texts and
# show the actual values next to the estimates.

# Scatter plot of the long-word percentage against the letter count.
hajuvusjoonis <- ggplot(arvud, aes(tahti, kymnejarohkemtahelistepr)) +
  geom_point()
hajuvusjoonis
# Overall mean percentage; used below as the baseline "prediction".
keskmine_protsent <- mean(arvud$kymnejarohkemtahelistepr)
keskmine_protsent
## [1] 8.641552
# The same scatter plot with the overall mean drawn as a horizontal line.
hajuvusjoonis +
  geom_hline(yintercept = keskmine_protsent)
# Fit the regression once; it is printed, summarised and used for prediction.
mudel <- lm(kymnejarohkemtahelistepr ~ tahti, data = arvud)
mudel
##
## Call:
## lm(formula = kymnejarohkemtahelistepr ~ tahti, data = arvud)
##
## Coefficients:
## (Intercept) tahti
## 7.3153583 0.0007273
summary(mudel)
##
## Call:
## lm(formula = kymnejarohkemtahelistepr ~ tahti, data = arvud)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.596 -2.809 -0.586 2.275 92.677
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.315e+00 5.202e-02 140.63 <2e-16 ***
## tahti 7.273e-04 1.781e-05 40.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.583 on 12722 degrees of freedom
## Multiple R-squared: 0.1158, Adjusted R-squared: 0.1158
## F-statistic: 1667 on 1 and 12722 DF, p-value: < 2.2e-16
# Exploratory draw of 20 letter counts; the result is only printed, not kept.
sample(arvud$tahti, size = 20)
## [1] 1723 3344 648 1932 675 2117 2514 849 949 2878 605 1374 538 716
## [15] 2298 1179 645 1241 674 1675
# Draw 20 random texts (whole rows) and attach the model's predicted
# percentage to each of them.
uuritav <- sample_n(arvud, size = 20)
uuritav[["uusprotsent"]] <- predict(mudel, newdata = uuritav)
# Keep the letter count, the actual percentage and the predicted percentage.
tulemus <- select(uuritav, tahti, kymnejarohkemtahelistepr, uusprotsent)
tulemus
## # A tibble: 20 x 3
## tahti kymnejarohkemtahelistepr uusprotsent
## <int> <dbl> <dbl>
## 1 1621 3.44 8.494272
## 2 707 9.58 7.829542
## 3 597 7.06 7.749542
## 4 623 11.90 7.768451
## 5 620 2.28 7.766269
## 6 750 9.99 7.860815
## 7 820 6.90 7.911724
## 8 1156 7.43 8.156089
## 9 1498 6.46 8.404817
## 10 609 3.33 7.758269
## 11 746 6.19 7.857906
## 12 613 8.87 7.761178
## 13 2930 10.73 9.446275
## 14 799 10.09 7.896451
## 15 467 5.40 7.654996
## 16 6583 12.07 12.103012
## 17 569 5.07 7.729178
## 18 2596 12.53 9.203365
## 19 2553 13.22 9.172092
## 20 738 4.55 7.852087
# Absolute error of the model's prediction (`vaheuuega`) and of the overall
# mean used as a prediction (`vahekeskmisega`), per sampled text. The table
# is computed once and used for both the printout and the totals.
vordlus <- mutate(
  tulemus,
  vaheuuega = abs(kymnejarohkemtahelistepr - uusprotsent),
  vahekeskmisega = abs(kymnejarohkemtahelistepr - keskmine_protsent)
)
vordlus
## # A tibble: 20 x 5
## tahti kymnejarohkemtahelistepr uusprotsent vaheuuega vahekeskmisega
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1621 3.44 8.494272 5.05427156 5.2015522
## 2 707 9.58 7.829542 1.75045805 0.9384478
## 3 597 7.06 7.749542 0.68954167 1.5815522
## 4 623 11.90 7.768451 4.13154918 3.2584478
## 5 620 2.28 7.766269 5.48626900 6.3615522
## 6 750 9.99 7.860815 2.12918522 1.3484478
## 7 820 6.90 7.911724 1.01172406 1.7415522
## 8 1156 7.43 8.156089 0.72608855 1.2115522
## 9 1498 6.46 8.404817 1.94481670 2.1815522
## 10 609 3.33 7.758269 4.42826897 5.3115522
## 11 746 6.19 7.857906 1.66790568 2.4515522
## 12 613 8.87 7.761178 1.10882193 0.2284478
## 13 2930 10.73 9.446275 1.28372508 2.0884478
## 14 799 10.09 7.896451 2.19354873 1.4484478
## 15 467 5.40 7.654996 2.25499588 3.2415522
## 16 6583 12.07 12.103012 0.03301157 3.4284478
## 17 569 5.07 7.729178 2.65917796 3.5715522
## 18 2596 12.53 9.203365 3.32663503 3.8884478
## 19 2553 13.22 9.172092 4.04790786 4.5784478
## 20 738 4.55 7.852087 3.30208748 4.0915522
# Total absolute error of each approach over the 20 sampled texts.
summarise(
  vordlus,
  uuesumma = sum(vaheuuega),
  keskmisesumma = sum(vahekeskmisega)
)
## # A tibble: 1 x 2
## uuesumma keskmisesumma
## <dbl> <dbl>
## 1 49.22999 58.1531