library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.2.1 --
## √ ggplot2 3.0.0     √ purrr   0.2.5
## √ tibble  1.4.2     √ dplyr   0.7.6
## √ tidyr   0.8.1     √ stringr 1.3.1
## √ readr   1.1.1     √ forcats 0.3.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load the per-document counts table straight from the web into a tibble.
dokarvud <- read_csv(
  file = "http://www.tlu.ee/~jaagup/andmed/keel/korpus/dokarvud.txt"
)
## Parsed with column specification:
## cols(
##   kood = col_character(),
##   tahti = col_integer(),
##   sonu = col_integer(),
##   lauseid = col_integer(),
##   vigu = col_integer(),
##   veatyype = col_integer(),
##   kolmetahelistepr = col_double(),
##   viietahelistepr = col_double(),
##   kymnejarohkemtahelistepr = col_double(),
##   kahesonalistepr = col_double(),
##   kolmesonalistepr = col_double(),
##   kuuekuni9sonalistepr = col_double(),
##   kymnekuni20sonalistepr = col_double()
## )
# Pearson correlation between the tahti and sonu columns.
with(dokarvud, cor(tahti, sonu))
## [1] 0.9485238
# Scatter plot of sonu against tahti, one point per row of dokarvud.
dokarvud %>%
  ggplot(aes(x = tahti, y = sonu)) +
  geom_point()

# Pearson correlation between kolmetahelistepr and kymnejarohkemtahelistepr.
with(dokarvud, cor(kolmetahelistepr, kymnejarohkemtahelistepr))
## [1] 0.04020447
# Scatter plot of the same two columns, keeping only the rows where
# kolmetahelistepr is at least 10.
dokarvud %>%
  filter(kolmetahelistepr >= 10) %>%
  ggplot(aes(x = kolmetahelistepr, y = kymnejarohkemtahelistepr)) +
  geom_point()

# Correlation matrix of every column except the character column kood.
cor(select(dokarvud, -kood))
##                                tahti         sonu     lauseid        vigu
## tahti                     1.00000000  0.948523752  0.76349587  0.11671231
## sonu                      0.94852375  1.000000000  0.87509753  0.09399618
## lauseid                   0.76349587  0.875097529  1.00000000  0.03921906
## vigu                      0.11671231  0.093996181  0.03921906  1.00000000
## veatyype                  0.06077898  0.034037878 -0.01433122  0.91320970
## kolmetahelistepr          0.26874036  0.285605263  0.22136377  0.13545784
## viietahelistepr          -0.02614740 -0.001154944  0.02417365  0.02773669
## kymnejarohkemtahelistepr  0.34035927  0.299882653  0.23341836  0.07798654
## kahesonalistepr          -0.27392780 -0.241113660 -0.13734791 -0.26149534
## kolmesonalistepr         -0.25697597 -0.230008481 -0.11005180 -0.12357209
## kuuekuni9sonalistepr     -0.10023959 -0.071771981 -0.02069215  0.01784823
## kymnekuni20sonalistepr    0.32848945  0.248616557  0.02539867  0.13799848
##                             veatyype kolmetahelistepr viietahelistepr
## tahti                     0.06077898       0.26874036    -0.026147395
## sonu                      0.03403788       0.28560526    -0.001154944
## lauseid                  -0.01433122       0.22136377     0.024173645
## vigu                      0.91320970       0.13545784     0.027736689
## veatyype                  1.00000000       0.16312916     0.034766664
## kolmetahelistepr          0.16312916       1.00000000     0.146729210
## viietahelistepr           0.03476666       0.14672921     1.000000000
## kymnejarohkemtahelistepr  0.09011467       0.04020447    -0.084268512
## kahesonalistepr          -0.30795975      -0.29144859     0.007988313
## kolmesonalistepr         -0.14088647      -0.17852454     0.061309363
## kuuekuni9sonalistepr      0.02714058       0.08658549     0.202496419
## kymnekuni20sonalistepr    0.16369640       0.31855493     0.029144301
##                          kymnejarohkemtahelistepr kahesonalistepr
## tahti                                  0.34035927    -0.273927797
## sonu                                   0.29988265    -0.241113660
## lauseid                                0.23341836    -0.137347912
## vigu                                   0.07798654    -0.261495337
## veatyype                               0.09011467    -0.307959749
## kolmetahelistepr                       0.04020447    -0.291448594
## viietahelistepr                       -0.08426851     0.007988313
## kymnejarohkemtahelistepr               1.00000000    -0.173145261
## kahesonalistepr                       -0.17314526     1.000000000
## kolmesonalistepr                      -0.16194025     0.307186145
## kuuekuni9sonalistepr                  -0.04886594    -0.011285771
## kymnekuni20sonalistepr                 0.31412171    -0.341191678
##                          kolmesonalistepr kuuekuni9sonalistepr
## tahti                         -0.25697597          -0.10023959
## sonu                          -0.23000848          -0.07177198
## lauseid                       -0.11005180          -0.02069215
## vigu                          -0.12357209           0.01784823
## veatyype                      -0.14088647           0.02714058
## kolmetahelistepr              -0.17852454           0.08658549
## viietahelistepr                0.06130936           0.20249642
## kymnejarohkemtahelistepr      -0.16194025          -0.04886594
## kahesonalistepr                0.30718614          -0.01128577
## kolmesonalistepr               1.00000000          -0.13223122
## kuuekuni9sonalistepr          -0.13223122           1.00000000
## kymnekuni20sonalistepr        -0.39380628          -0.28428212
##                          kymnekuni20sonalistepr
## tahti                                0.32848945
## sonu                                 0.24861656
## lauseid                              0.02539867
## vigu                                 0.13799848
## veatyype                             0.16369640
## kolmetahelistepr                     0.31855493
## viietahelistepr                      0.02914430
## kymnejarohkemtahelistepr             0.31412171
## kahesonalistepr                     -0.34119168
## kolmesonalistepr                    -0.39380628
## kuuekuni9sonalistepr                -0.28428212
## kymnekuni20sonalistepr               1.00000000
# Scatter-plot matrix of all numeric columns, drawn from the first 100 rows
# only.
pairs(select(head(dokarvud, 100), -kood))

# Significance test and confidence interval for the tahti/sonu correlation
# over the whole table. The argument expressions are written out in full
# because cor.test() prints them on its "data:" line.
cor.test(x = dokarvud$tahti, y = dokarvud$sonu)
## 
##  Pearson's product-moment correlation
## 
## data:  dokarvud$tahti and dokarvud$sonu
## t = 337.81, df = 12722, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9467517 0.9502383
## sample estimates:
##       cor 
## 0.9485238
# Repeat the correlation test on 100 rows drawn at random, so the numbers
# differ from run to run. A fixed alternative is to take the first 100 rows
# instead: esisada <- head(dokarvud, 100)
esisada <- dokarvud %>% sample_n(size = 100)
cor.test(x = esisada$tahti, y = esisada$sonu)
## 
##  Pearson's product-moment correlation
## 
## data:  esisada$tahti and esisada$sonu
## t = 43.183, df = 98, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9625861 0.9829466
## sample estimates:
##       cor 
## 0.9747154
# Scatter plot of sonu against tahti with ggplot2's default smoother on top.
ggplot(dokarvud, aes(x = tahti, y = sonu)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# The same scatter plot, this time with a straight least-squares line.
ggplot(dokarvud, aes(x = tahti, y = sonu)) +
  geom_point() +
  geom_smooth(method = "lm")

# Linear regression of tahti on sonu, naming the columns through `dokarvud$`
# inside the formula; printing the fit shows the call and the coefficients.
lm(formula = dokarvud$tahti ~ dokarvud$sonu)
## 
## Call:
## lm(formula = dokarvud$tahti ~ dokarvud$sonu)
## 
## Coefficients:
##   (Intercept)  dokarvud$sonu  
##      -161.978          7.241
# Refit the same model and print its full summary (residuals, coefficient
# table, R-squared, F-statistic).
lm(dokarvud$tahti ~ dokarvud$sonu) %>%
  summary()
## 
## Call:
## lm(formula = dokarvud$tahti ~ dokarvud$sonu)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8302.7   -90.6    46.7   140.0  9380.0 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -161.97760    8.69239  -18.63   <2e-16 ***
## dokarvud$sonu    7.24094    0.02144  337.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 722.4 on 12722 degrees of freedom
## Multiple R-squared:  0.8997, Adjusted R-squared:  0.8997 
## F-statistic: 1.141e+05 on 1 and 12722 DF,  p-value: < 2.2e-16
# The same regression written with bare column names and a `data` argument.
lm(formula = tahti ~ sonu, data = dokarvud)
## 
## Call:
## lm(formula = tahti ~ sonu, data = dokarvud)
## 
## Coefficients:
## (Intercept)         sonu  
##    -161.978        7.241
# Predicted tahti for three made-up values of sonu (10, 100 and 1000), using
# the regression fitted on the whole table.
lm(tahti ~ sonu, data = dokarvud) %>%
  predict(newdata = tibble(sonu = c(10, 100, 1000)))
##          1          2          3 
##  -89.56823  562.11608 7078.95917
# Load the weekly sales transactions table from the UCI repository.
andmed <- read_csv(
  file = "http://archive.ics.uci.edu/ml/machine-learning-databases/00396/Sales_Transactions_Dataset_Weekly.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   Product_Code = col_character(),
##   `Normalized 0` = col_double(),
##   `Normalized 1` = col_double(),
##   `Normalized 2` = col_double(),
##   `Normalized 3` = col_double(),
##   `Normalized 4` = col_double(),
##   `Normalized 5` = col_double(),
##   `Normalized 6` = col_double(),
##   `Normalized 7` = col_double(),
##   `Normalized 8` = col_double(),
##   `Normalized 9` = col_double(),
##   `Normalized 10` = col_double(),
##   `Normalized 11` = col_double(),
##   `Normalized 12` = col_double(),
##   `Normalized 13` = col_double(),
##   `Normalized 14` = col_double(),
##   `Normalized 15` = col_double(),
##   `Normalized 16` = col_double(),
##   `Normalized 17` = col_double(),
##   `Normalized 18` = col_double()
##   # ... with 33 more columns
## )
## See spec(...) for full column specifications.
# List every column name of the loaded table.
names(andmed)
##   [1] "Product_Code"  "W0"            "W1"            "W2"           
##   [5] "W3"            "W4"            "W5"            "W6"           
##   [9] "W7"            "W8"            "W9"            "W10"          
##  [13] "W11"           "W12"           "W13"           "W14"          
##  [17] "W15"           "W16"           "W17"           "W18"          
##  [21] "W19"           "W20"           "W21"           "W22"          
##  [25] "W23"           "W24"           "W25"           "W26"          
##  [29] "W27"           "W28"           "W29"           "W30"          
##  [33] "W31"           "W32"           "W33"           "W34"          
##  [37] "W35"           "W36"           "W37"           "W38"          
##  [41] "W39"           "W40"           "W41"           "W42"          
##  [45] "W43"           "W44"           "W45"           "W46"          
##  [49] "W47"           "W48"           "W49"           "W50"          
##  [53] "W51"           "MIN"           "MAX"           "Normalized 0" 
##  [57] "Normalized 1"  "Normalized 2"  "Normalized 3"  "Normalized 4" 
##  [61] "Normalized 5"  "Normalized 6"  "Normalized 7"  "Normalized 8" 
##  [65] "Normalized 9"  "Normalized 10" "Normalized 11" "Normalized 12"
##  [69] "Normalized 13" "Normalized 14" "Normalized 15" "Normalized 16"
##  [73] "Normalized 17" "Normalized 18" "Normalized 19" "Normalized 20"
##  [77] "Normalized 21" "Normalized 22" "Normalized 23" "Normalized 24"
##  [81] "Normalized 25" "Normalized 26" "Normalized 27" "Normalized 28"
##  [85] "Normalized 29" "Normalized 30" "Normalized 31" "Normalized 32"
##  [89] "Normalized 33" "Normalized 34" "Normalized 35" "Normalized 36"
##  [93] "Normalized 37" "Normalized 38" "Normalized 39" "Normalized 40"
##  [97] "Normalized 41" "Normalized 42" "Normalized 43" "Normalized 44"
## [101] "Normalized 45" "Normalized 46" "Normalized 47" "Normalized 48"
## [105] "Normalized 49" "Normalized 50" "Normalized 51"
# Keep only the weekly columns W0..W51, then show the week-by-week
# difference between the first and the third row.
nadalad <- select(andmed, W0:W51)
nadalad[1, ] - nadalad[3, ]
##   W0 W1 W2 W3 W4 W5 W6 W7 W8 W9 W10 W11 W12 W13 W14 W15 W16 W17 W18 W19
## 1  4  1  2 -1  3  4  7  8 -6  8  -3   5  12   2  -3   1   7  -2  -7   5
##   W20 W21 W22 W23 W24 W25 W26 W27 W28 W29 W30 W31 W32 W33 W34 W35 W36 W37
## 1  -2  -2  -4   4  -5   1  -7   5   2  -2  -4   1   5  -2  -2   5   4  -3
##   W38 W39 W40 W41 W42 W43 W44 W45 W46 W47 W48 W49 W50 W51
## 1   0   8   4   6  -2  -7   3   5   5  -5  -7  -2  -3   3
# TRUE for each week in which row 1 exceeds row 3 (`-` binds tighter than
# `>`, so the subtraction happens first).
nadalad[1, ] - nadalad[3, ] > 0
##        W0   W1   W2    W3   W4   W5   W6   W7    W8   W9   W10  W11  W12
## [1,] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE
##       W13   W14  W15  W16   W17   W18  W19   W20   W21   W22  W23   W24
## [1,] TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
##       W25   W26  W27  W28   W29   W30  W31  W32   W33   W34  W35  W36
## [1,] TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE
##        W37   W38  W39  W40  W41   W42   W43  W44  W45  W46   W47   W48
## [1,] FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE
##        W49   W50  W51
## [1,] FALSE FALSE TRUE
# Names of the weeks in which row 1 exceeds row 3, picked with the logical
# mask built from the row difference.
colnames(nadalad)[nadalad[1, ] - nadalad[3, ] > 0]
##  [1] "W0"  "W1"  "W2"  "W4"  "W5"  "W6"  "W7"  "W9"  "W11" "W12" "W13"
## [12] "W15" "W16" "W19" "W23" "W25" "W27" "W28" "W31" "W32" "W35" "W36"
## [23] "W39" "W40" "W41" "W44" "W45" "W46" "W51"
# Weeks in which product "P1" has a larger value than product "P3", found by
# looking the two products up by their code.
nadalad <- andmed %>% select(Product_Code, W0:W51)
# Each product's weekly values are fetched by Product_Code, so the sign of the
# difference does not depend on the order of the rows in the file. The week
# labels are taken from the selected columns rather than from a fixed index
# range, and the columns are named explicitly instead of relying on names
# generated for an unnamed matrix. If a code is missing or occurs more than
# once, the column lengths no longer match and tibble() stops with an error.
tibble(
  nadal = nadalad %>%
    select(-Product_Code) %>%
    colnames(),
  p1 = nadalad %>%
    filter(Product_Code == "P1") %>%
    select(-Product_Code) %>%
    unlist(use.names = FALSE),
  p3 = nadalad %>%
    filter(Product_Code == "P3") %>%
    select(-Product_Code) %>%
    unlist(use.names = FALSE)
) %>%
  mutate(vahe = p1 - p3) %>%
  filter(vahe > 0) %>%
  pull(nadal)
##  [1] "W0"  "W1"  "W2"  "W4"  "W5"  "W6"  "W7"  "W9"  "W11" "W12" "W13"
## [12] "W15" "W16" "W19" "W23" "W25" "W27" "W28" "W31" "W32" "W35" "W36"
## [23] "W39" "W40" "W41" "W44" "W45" "W46" "W51"
# Product codes ordered by how strongly each product's weekly series
# correlates with the series in the first row, from lowest to highest.
# The first row correlates perfectly with itself and therefore comes last.
nadalad <- andmed %>% select(W0:W51)
# t() puts one product per column; passing the first row as `y` makes cor()
# compute only the correlations with that row instead of the full
# product-by-product matrix. The result is a one-column matrix, hence [, 1].
andmed$Product_Code[order(cor(t(nadalad), unlist(nadalad[1, ]))[, 1])]
##   [1] "P704" "P735" "P389" "P705" "P374" "P372" "P764" "P340" "P362" "P804"
##  [11] "P789" "P743" "P248" "P358" "P675" "P221" "P117" "P356" "P339" "P685"
##  [21] "P707" "P422" "P347" "P775" "P555" "P490" "P378" "P818" "P800" "P765"
##  [31] "P233" "P752" "P576" "P795" "P478" "P689" "P206" "P721" "P217" "P784"
##  [41] "P793" "P386" "P780" "P405" "P720" "P692" "P429" "P594" "P694" "P534"
##  [51] "P737" "P734" "P455" "P408" "P223" "P409" "P367" "P253" "P382" "P376"
##  [61] "P699" "P600" "P674" "P418" "P291" "P470" "P456" "P741" "P637" "P410"
##  [71] "P258" "P807" "P354" "P683" "P672" "P686" "P402" "P205" "P706" "P616"
##  [81] "P95"  "P733" "P597" "P698" "P262" "P671" "P419" "P263" "P779" "P792"
##  [91] "P589" "P755" "P801" "P503" "P663" "P806" "P745" "P361" "P732" "P373"
## [101] "P697" "P664" "P420" "P771" "P106" "P346" "P650" "P799" "P703" "P278"
## [111] "P601" "P785" "P202" "P100" "P392" "P480" "P238" "P532" "P769" "P444"
## [121] "P652" "P438" "P599" "P797" "P255" "P242" "P757" "P432" "P277" "P729"
## [131] "P413" "P591" "P393" "P384" "P417" "P287" "P232" "P653" "P368" "P265"
## [141] "P359" "P396" "P447" "P170" "P796" "P371" "P606" "P776" "P216" "P398"
## [151] "P638" "P603" "P773" "P250" "P460" "P459" "P604" "P610" "P551" "P461"
## [161] "P6"   "P538" "P611" "P96"  "P237" "P753" "P712" "P568" "P586" "P542"
## [171] "P484" "P596" "P742" "P476" "P252" "P590" "P809" "P274" "P512" "P578"
## [181] "P575" "P768" "P257" "P311" "P595" "P295" "P391" "P646" "P760" "P375"
## [191] "P116" "P608" "P366" "P781" "P751" "P640" "P200" "P270" "P94"  "P436"
## [201] "P273" "P474" "P293" "P330" "P772" "P472" "P449" "P582" "P536" "P395"
## [211] "P394" "P762" "P673" "P749" "P241" "P169" "P211" "P492" "P251" "P816"
## [221] "P269" "P639" "P39"  "P360" "P808" "P324" "P344" "P657" "P592" "P403"
## [231] "P80"  "P312" "P711" "P448" "P260" "P759" "P411" "P502" "P783" "P350"
## [241] "P442" "P243" "P256" "P383" "P247" "P814" "P782" "P349" "P504" "P320"
## [251] "P226" "P246" "P16"  "P224" "P740" "P285" "P533" "P676" "P156" "P805"
## [261] "P497" "P540" "P634" "P426" "P264" "P412" "P819" "P643" "P651" "P607"
## [271] "P580" "P105" "P724" "P195" "P728" "P598" "P272" "P570" "P397" "P577"
## [281] "P234" "P286" "P118" "P13"  "P475" "P261" "P441" "P670" "P351" "P416"
## [291] "P520" "P767" "P514" "P281" "P228" "P230" "P649" "P154" "P556" "P236"
## [301] "P669" "P613" "P377" "P400" "P754" "P526" "P43"  "P585" "P462" "P691"
## [311] "P213" "P778" "P121" "P541" "P380" "P662" "P736" "P627" "P584" "P8"  
## [321] "P33"  "P495" "P296" "P115" "P794" "P443" "P695" "P201" "P343" "P700"
## [331] "P464" "P282" "P326" "P297" "P791" "P189" "P316" "P306" "P731" "P573"
## [341] "P626" "P810" "P177" "P505" "P275" "P58"  "P572" "P817" "P655" "P430"
## [351] "P321" "P623" "P268" "P310" "P301" "P748" "P679" "P802" "P37"  "P73" 
## [361] "P288" "P337" "P207" "P696" "P482" "P702" "P750" "P615" "P249" "P172"
## [371] "P352" "P774" "P583" "P629" "P240" "P208" "P546" "P786" "P587" "P289"
## [381] "P756" "P82"  "P25"  "P680" "P666" "P159" "P74"  "P668" "P53"  "P342"
## [391] "P122" "P440" "P510" "P516" "P813" "P235" "P434" "P529" "P458" "P77" 
## [401] "P465" "P424" "P559" "P328" "P198" "P642" "P744" "P803" "P284" "P421"
## [411] "P428" "P467" "P468" "P644" "P722" "P763" "P27"  "P747" "P563" "P103"
## [421] "P65"  "P336" "P746" "P544" "P537" "P129" "P605" "P136" "P677" "P150"
## [431] "P307" "P283" "P525" "P399" "P239" "P28"  "P245" "P481" "P290" "P687"
## [441] "P44"  "P446" "P633" "P7"   "P770" "P518" "P401" "P47"  "P660" "P787"
## [451] "P714" "P219" "P355" "P715" "P113" "P335" "P445" "P4"   "P515" "P203"
## [461] "P562" "P130" "P209" "P678" "P693" "P654" "P507" "P364" "P423" "P738"
## [471] "P530" "P62"  "P204" "P3"   "P280" "P473" "P719" "P667" "P535" "P628"
## [481] "P303" "P425" "P450" "P488" "P647" "P34"  "P294" "P777" "P107" "P140"
## [491] "P338" "P14"  "P758" "P212" "P454" "P153" "P126" "P619" "P499" "P479"
## [501] "P493" "P690" "P174" "P379" "P766" "P407" "P333" "P227" "P199" "P91" 
## [511] "P137" "P299" "P466" "P661" "P98"  "P279" "P471" "P112" "P487" "P143"
## [521] "P183" "P187" "P111" "P388" "P406" "P548" "P369" "P357" "P489" "P81" 
## [531] "P469" "P659" "P60"  "P477" "P553" "P726" "P739" "P717" "P636" "P108"
## [541] "P593" "P31"  "P370" "P345" "P609" "P483" "P69"  "P220" "P266" "P59" 
## [551] "P494" "P581" "P225" "P78"  "P327" "P500" "P12"  "P713" "P579" "P68" 
## [561] "P57"  "P381" "P656" "P125" "P109" "P558" "P124" "P67"  "P348" "P55" 
## [571] "P545" "P612" "P135" "P414" "P119" "P92"  "P415" "P433" "P665" "P435"
## [581] "P19"  "P457" "P63"  "P139" "P215" "P254" "P259" "P716" "P761" "P29" 
## [591] "P110" "P2"   "P131" "P341" "P521" "P267" "P602" "P528" "P727" "P790"
## [601] "P452" "P304" "P565" "P138" "P363" "P708" "P231" "P427" "P64"  "P182"
## [611] "P132" "P11"  "P164" "P404" "P641" "P45"  "P26"  "P319" "P513" "P574"
## [621] "P524" "P508" "P86"  "P50"  "P811" "P5"   "P114" "P168" "P682" "P176"
## [631] "P620" "P21"  "P23"  "P85"  "P788" "P437" "P517" "P222" "P120" "P491"
## [641] "P72"  "P588" "P178" "P24"  "P309" "P155" "P522" "P99"  "P175" "P52" 
## [651] "P151" "P571" "P35"  "P453" "P625" "P632" "P730" "P148" "P815" "P431"
## [661] "P54"  "P160" "P127" "P614" "P71"  "P439" "P163" "P192" "P315" "P66" 
## [671] "P543" "P331" "P688" "P547" "P147" "P552" "P323" "P539" "P557" "P486"
## [681] "P149" "P509" "P180" "P10"  "P184" "P152" "P185" "P322" "P46"  "P305"
## [691] "P49"  "P812" "P630" "P681" "P97"  "P38"  "P89"  "P292" "P387" "P560"
## [701] "P51"  "P463" "P18"  "P314" "P179" "P41"  "P158" "P519" "P101" "P171"
## [711] "P9"   "P317" "P329" "P229" "P193" "P70"  "P718" "P167" "P554" "P214"
## [721] "P48"  "P564" "P145" "P569" "P188" "P498" "P701" "P550" "P313" "P181"
## [731] "P76"  "P32"  "P190" "P300" "P506" "P128" "P631" "P83"  "P390" "P144"
## [741] "P658" "P194" "P523" "P271" "P210" "P798" "P61"  "P166" "P567" "P93" 
## [751] "P134" "P197" "P622" "P191" "P173" "P501" "P15"  "P298" "P308" "P318"
## [761] "P332" "P511" "P20"  "P141" "P146" "P88"  "P218" "P527" "P186" "P334"
## [771] "P325" "P40"  "P42"  "P365" "P617" "P549" "P162" "P84"  "P22"  "P102"
## [781] "P196" "P123" "P79"  "P17"  "P276" "P90"  "P684" "P244" "P56"  "P161"
## [791] "P165" "P635" "P531" "P566" "P30"  "P618" "P36"  "P142" "P496" "P75" 
## [801] "P561" "P451" "P104" "P157" "P87"  "P485" "P624" "P302" "P133" "P621"
## [811] "P1"