Andmed sisse

 # Read the semicolon-separated student data. stringsAsFactors = TRUE is spelled
 # out because R >= 4.0 no longer converts text columns to factors by default,
 # and the outputs below (class "factor") rely on that conversion.
 andmed <- read.table("student-mat.csv", header = TRUE, sep = ";", dec = ".",
                      stringsAsFactors = TRUE)
 # Fixed seed so the randomly placed NAs are the same on every run.
 set.seed(1234)
 # Blank out roughly 10% of each column: indexing with NA yields NA, indexing
 # with TRUE keeps the original value.
 andmed <- as.data.frame(lapply(andmed, function(x) {
   x[sample(c(TRUE, NA), prob = c(0.9, 0.1), size = length(x), replace = TRUE)]
 }))

Põhjuse järgi grupeerimine ja keskmise leidmine kahel moel

  # Mean of G3 within each level of reason; missing G3 values are skipped.
  with(andmed, tapply(G3, reason, FUN = mean, na.rm = TRUE))
##     course       home      other reputation 
##    9.62500   10.68478   11.24000   11.14458
  # Same group means, returned as a data frame instead of a named vector.
  with(andmed, aggregate(G3, by = list(reason), FUN = mean, na.rm = TRUE))
##      Group.1        x
## 1     course  9.62500
## 2       home 10.68478
## 3      other 11.24000
## 4 reputation 11.14458

Mitme tulba järgi grupeerimine

  # Mean of G3 for every reason x sex combination, one row per combination.
  with(andmed, aggregate(G3, by = list(reason, sex), FUN = mean, na.rm = TRUE))
##      Group.1 Group.2         x
## 1     course       F  8.957447
## 2       home       F 10.050000
## 3      other       F 12.769231
## 4 reputation       F 10.717391
## 5     course       M 10.454545
## 6       home       M 11.977273
## 7      other       M 10.000000
## 8 reputation       M 12.482759
  # Same means laid out as a reason-by-sex table.
  with(andmed, tapply(G3, list(reason, sex), FUN = mean, na.rm = TRUE))
##                    F        M
## course      8.957447 10.45455
## home       10.050000 11.97727
## other      12.769231 10.00000
## reputation 10.717391 12.48276

Loendamine tunnuspaaride kaupa

# Cross-tabulate reason against sex; rows with an NA in either are left out.
table(andmed[["reason"]], andmed[["sex"]])
##             
##               F  M
##   course     54 63
##   home       44 44
##   other      13 12
##   reputation 51 30
# Same counts via tapply: length() counts the rows that fall into each cell.
with(andmed, tapply(G3, list(reason, sex), FUN = length))
##             F  M
## course     54 63
## home       44 44
## other      13 12
## reputation 51 30
# Same counts in long form, one row per reason x sex combination.
with(andmed, aggregate(G3, by = list(reason, sex), FUN = length))
##      Group.1 Group.2  x
## 1     course       F 54
## 2       home       F 44
## 3      other       F 13
## 4 reputation       F 51
## 5     course       M 63
## 6       home       M 44
## 7      other       M 12
## 8 reputation       M 30

Puuduvate väärtuste välja toomine

# Cross-tabulation that also shows an <NA> row and column for missing values.
table(andmed[["reason"]], andmed[["sex"]], useNA = "always")
##             
##               F  M <NA>
##   course     54 63   19
##   home       44 44    9
##   other      13 12    4
##   reputation 51 30    8
##   <NA>       23 14    7

Reaprotsendid

# Row proportions: every count divided by the total of its own row.
prop.table(
  with(andmed, tapply(G3, list(reason, sex), FUN = length)),
  margin = 1
)
##                    F         M
## course     0.4615385 0.5384615
## home       0.5000000 0.5000000
## other      0.5200000 0.4800000
## reputation 0.6296296 0.3703704
# Class of every column, returned as a named character vector.
vapply(andmed, class, character(1))
##     school        sex        age    address    famsize    Pstatus 
##   "factor"   "factor"  "integer"   "factor"   "factor"   "factor" 
##       Medu       Fedu       Mjob       Fjob     reason   guardian 
##  "integer"  "integer"   "factor"   "factor"   "factor"   "factor" 
## traveltime  studytime   failures  schoolsup     famsup       paid 
##  "integer"  "integer"  "integer"   "factor"   "factor"   "factor" 
## activities    nursery     higher   internet   romantic     famrel 
##   "factor"   "factor"   "factor"   "factor"   "factor"  "integer" 
##   freetime      goout       Dalc       Walc     health   absences 
##  "integer"  "integer"  "integer"  "integer"  "integer"  "integer" 
##         G1         G2         G3 
##  "integer"  "integer"  "integer"

Sama, puuduvaid väärtusi arvestades

# Row proportions with the <NA> row and column included in the totals.
prop.table(
  table(andmed[["reason"]], andmed[["sex"]], useNA = "always"),
  margin = 1
)
##             
##                       F          M       <NA>
##   course     0.39705882 0.46323529 0.13970588
##   home       0.45360825 0.45360825 0.09278351
##   other      0.44827586 0.41379310 0.13793103
##   reputation 0.57303371 0.33707865 0.08988764
##   <NA>       0.52272727 0.31818182 0.15909091

Sama eraldi käskudega arvutades

# Row shares computed step by step: count the rows in each reason x sex cell,
# then divide every row by its own total. Dividing the whole table at once
# works for any set of sex levels, not only columns named F and M.
abi <- tapply(andmed$G3, list(andmed$reason, andmed$sex), FUN = length)
abi <- as.data.frame(abi / rowSums(abi))
abi
##                    F         M
## course     0.4615385 0.5384615
## home       0.5000000 0.5000000
## other      0.5200000 0.4800000
## reputation 0.6296296 0.3703704

Helbi järgi teeb apply käsklus andmestiku kõigepealt maatriksiks (kus on kõik andmed ühte tüüpi) ning alles siis hakkab sealt pealt midagi arvutama.

# apply() first turns the data frame into a matrix; with mixed column types
# that is a character matrix, so every column reports "character".
apply(andmed, 2, class)
##      school         sex         age     address     famsize     Pstatus 
## "character" "character" "character" "character" "character" "character" 
##        Medu        Fedu        Mjob        Fjob      reason    guardian 
## "character" "character" "character" "character" "character" "character" 
##  traveltime   studytime    failures   schoolsup      famsup        paid 
## "character" "character" "character" "character" "character" "character" 
##  activities     nursery      higher    internet    romantic      famrel 
## "character" "character" "character" "character" "character" "character" 
##    freetime       goout        Dalc        Walc      health    absences 
## "character" "character" "character" "character" "character" "character" 
##          G1          G2          G3 
## "character" "character" "character"

Funktsioon sapply paistab toimetavat tulpade kaupa.

# sapply() visits the data frame column by column, so each column reports
# its own class.
sapply(andmed, class)
##     school        sex        age    address    famsize    Pstatus 
##   "factor"   "factor"  "integer"   "factor"   "factor"   "factor" 
##       Medu       Fedu       Mjob       Fjob     reason   guardian 
##  "integer"  "integer"   "factor"   "factor"   "factor"   "factor" 
## traveltime  studytime   failures  schoolsup     famsup       paid 
##  "integer"  "integer"  "integer"   "factor"   "factor"   "factor" 
## activities    nursery     higher   internet   romantic     famrel 
##   "factor"   "factor"   "factor"   "factor"   "factor"  "integer" 
##   freetime      goout       Dalc       Walc     health   absences 
##  "integer"  "integer"  "integer"  "integer"  "integer"  "integer" 
##         G1         G2         G3 
##  "integer"  "integer"  "integer"

Täisarvuliste tulpade vahelised korrelatsiooniseosed

# Correlation matrix of the integer-typed columns. is.integer() gives one
# TRUE/FALSE per column (FALSE for factors), which is safer than comparing
# class() strings. Pairwise deletion uses every row where both columns of a
# pair are present.
cor(
  andmed[, vapply(andmed, is.integer, logical(1))],
  use = "pairwise.complete.obs"
)
##                     age         Medu          Fedu   traveltime
## age         1.000000000 -0.170999347 -0.1374671154  0.103723025
## Medu       -0.170999347  1.000000000  0.6059590196 -0.119358465
## Fedu       -0.137467115  0.605959020  1.0000000000 -0.124463406
## traveltime  0.103723025 -0.119358465 -0.1244634061  1.000000000
## studytime   0.008913916  0.064399347 -0.0311792347 -0.072394529
## failures    0.218085518 -0.242623528 -0.2489395542  0.026658338
## famrel      0.066064900 -0.038538830  0.0005439266 -0.021052016
## freetime    0.024219231  0.024180673 -0.0104415692  0.003118606
## goout       0.129272441  0.027760804  0.0741534270  0.001543347
## Dalc        0.138055607  0.045468774 -0.0245558881  0.150626429
## Walc        0.111933608 -0.035694867  0.0023927012  0.105411479
## health     -0.109681336  0.002085572 -0.0047982034  0.005359396
## absences    0.216133428  0.103249508  0.0477099734  0.033528644
## G1         -0.093000940  0.168511582  0.1660233240 -0.078166755
## G2         -0.155886938  0.193440576  0.1788353411 -0.177215724
## G3         -0.177125537  0.209386411  0.1804063537 -0.105994400
##               studytime    failures        famrel     freetime
## age         0.008913916  0.21808552  0.0660648998  0.024219231
## Medu        0.064399347 -0.24262353 -0.0385388303  0.024180673
## Fedu       -0.031179235 -0.24893955  0.0005439266 -0.010441569
## traveltime -0.072394529  0.02665834 -0.0210520155  0.003118606
## studytime   1.000000000 -0.19400006  0.0301405684 -0.153998320
## failures   -0.194000064  1.00000000 -0.0334099567  0.107280148
## famrel      0.030140568 -0.03340996  1.0000000000  0.143597193
## freetime   -0.153998320  0.10728015  0.1435971935  1.000000000
## goout      -0.082795566  0.13917574  0.0955985295  0.261033334
## Dalc       -0.200847125  0.15316563 -0.0129843358  0.229145730
## Walc       -0.223366059  0.14020486 -0.1071959638  0.181197678
## health     -0.094705689  0.03491739  0.1143237982  0.066421298
## absences   -0.049614820  0.03245605 -0.0555200125 -0.083883191
## G1          0.185901272 -0.32684109  0.0093982637 -0.027543461
## G2          0.156807890 -0.33271215 -0.0477232590 -0.059467549
## G3          0.154662840 -0.36771562  0.0585591968 -0.018546232
##                   goout        Dalc         Walc       health
## age         0.129272441  0.13805561  0.111933608 -0.109681336
## Medu        0.027760804  0.04546877 -0.035694867  0.002085572
## Fedu        0.074153427 -0.02455589  0.002392701 -0.004798203
## traveltime  0.001543347  0.15062643  0.105411479  0.005359396
## studytime  -0.082795566 -0.20084712 -0.223366059 -0.094705689
## failures    0.139175741  0.15316563  0.140204857  0.034917389
## famrel      0.095598530 -0.01298434 -0.107195964  0.114323798
## freetime    0.261033334  0.22914573  0.181197678  0.066421298
## goout       1.000000000  0.24102492  0.401509922  0.020634943
## Dalc        0.241024917  1.00000000  0.639016469  0.064033736
## Walc        0.401509922  0.63901647  1.000000000  0.061752621
## health      0.020634943  0.06403374  0.061752621  1.000000000
## absences    0.043470310  0.11098547  0.144864538 -0.024171090
## G1         -0.163679732 -0.10885350 -0.137373404 -0.036866050
## G2         -0.202536107 -0.07341200 -0.094939468 -0.088890117
## G3         -0.139877566 -0.07131519 -0.071548758 -0.083961364
##                 absences           G1            G2          G3
## age         0.2161334276 -0.093000940 -0.1558869383 -0.17712554
## Medu        0.1032495080  0.168511582  0.1934405759  0.20938641
## Fedu        0.0477099734  0.166023324  0.1788353411  0.18040635
## traveltime  0.0335286444 -0.078166755 -0.1772157238 -0.10599440
## studytime  -0.0496148202  0.185901272  0.1568078902  0.15466284
## failures    0.0324560493 -0.326841095 -0.3327121510 -0.36771562
## famrel     -0.0555200125  0.009398264 -0.0477232590  0.05855920
## freetime   -0.0838831907 -0.027543461 -0.0594675488 -0.01854623
## goout       0.0434703105 -0.163679732 -0.2025361071 -0.13987757
## Dalc        0.1109854688 -0.108853498 -0.0734120016 -0.07131519
## Walc        0.1448645381 -0.137373404 -0.0949394684 -0.07154876
## health     -0.0241710897 -0.036866050 -0.0888901167 -0.08396136
## absences    1.0000000000 -0.033866911  0.0001359088  0.04458739
## G1         -0.0338669111  1.000000000  0.8419546857  0.80824586
## G2          0.0001359088  0.841954686  1.0000000000  0.90034167
## G3          0.0445873876  0.808245856  0.9003416708  1.00000000

Seosed graafiliselt; korraga sai vist veidi palju. Samuti jäävad täisarvulised väärtused joonisel üksteise peale.

# Scatterplot matrix of the integer-typed columns; is.integer() selects them
# without comparing class() strings.
pairs(andmed[, vapply(andmed, is.integer, logical(1))])

Puuduvate väärtuste loetelu tulpade kaupa

# Number of missing values in every column.
vapply(andmed, function(tulp) sum(is.na(tulp)), integer(1))
##     school        sex        age    address    famsize    Pstatus 
##         44         47         48         32         31         42 
##       Medu       Fedu       Mjob       Fjob     reason   guardian 
##         42         45         39         45         44         44 
## traveltime  studytime   failures  schoolsup     famsup       paid 
##         41         33         40         38         42         40 
## activities    nursery     higher   internet   romantic     famrel 
##         30         36         39         29         46         28 
##   freetime      goout       Dalc       Walc     health   absences 
##         30         37         38         35         47         46 
##         G1         G2         G3 
##         37         38         36

Sama joonisena

 # Missing-value count per column, drawn as bars with vertical axis labels.
 puuduvad <- vapply(andmed, function(tulp) sum(is.na(tulp)), integer(1))
 barplot(puuduvad, las = 3)

Kasvavana järjestatult

 # Same bars in ascending order, with headroom above the tallest bar.
 barplot(
   sort(puuduvad),
   las = 3,
   ylim = c(0, max(puuduvad) + 5),
   ylab = "Puuduvate väärtuste arv tunnuses"
 )

Kolme perioodi hinnete (G1, G2, G3) keskmine kooli valiku põhjuse kaupa

# Per-row mean of G1, G2 and G3, averaged within each reason group. A row with
# any missing grade evaluates to NA and is dropped by na.rm = TRUE.
with(andmed, tapply((G1 + G2 + G3) / 3, reason, FUN = mean, na.rm = TRUE))
##     course       home      other reputation 
##    9.90000   10.86111   10.31667   11.28311

Sama sapply abil.

# For every distinct reason (including NA) compute the overall mean of G1-G3
# over the rows that have all three grades present.
sapply(unique(andmed$reason), function(tyyp) {
  tulbad <- c("G1", "G2", "G3")
  # %in% matches NA against NA, so one expression selects the rows both for
  # ordinary reasons and for the missing-reason group.
  valitud <- andmed$reason %in% tyyp
  arvud <- na.omit(andmed[valitud, tulbad])
  vastus <- sum(arvud) / nrow(arvud) / length(tulbad)
  names(vastus) <- tyyp
  vastus
})
##     course      other       home reputation       <NA> 
##    9.90000   10.31667   10.86111   11.28311   11.10101

Sisseloetavate andmete algvariant.

Käsk sample võtab etteantud loetelust (TRUE, NA) etteantud tõenäosuste (0.9, 0.1) järgi väärtusi, kokku nii palju, kui funktsiooni parameetriks antud tulbas (x) väärtusi on. Parameeter replace=TRUE on vajalik selleks, et sama väärtust tohiks korduvalt võtta; muidu poleks kaheliikmelisest sisendist võimalik saada üle kahe tulemuse.

Tulemuseks saadakse TRUE/NA-de rida. Kantsulg-operaatoriga indekseerides tagastatakse TRUE korral x-i vastav väärtus ja NA korral NA, mis jääbki tulemusse sisse.

lapply väljastab listi (võtmed ja väärtused, iga tulp eraldi), as.data.frame paneb kõik selle jälle üldisesse ja arusaadavasse tabelisse kokku.

 # Re-read the raw data; stringsAsFactors = TRUE keeps text columns as factors
 # on R >= 4.0 as well.
 andmed <- read.table("student-mat.csv", header = TRUE, sep = ";", dec = ".",
                      stringsAsFactors = TRUE)
 # lapply() returns one masked vector per column; as.data.frame() joins them
 # back into a table with the original column types.
 andmed <- as.data.frame(lapply(andmed, function(x) {
   x[sample(c(TRUE, NA), prob = c(0.9, 0.1), size = length(x), replace = TRUE)]
 }))
 head(andmed)
##   school  sex age address famsize Pstatus Medu Fedu    Mjob     Fjob
## 1     GP <NA>  18       U     GT3       A   NA    4    <NA>  teacher
## 2     GP    F  17       U     GT3       T    1    1 at_home    other
## 3   <NA>    F  15       U    <NA>       T    1    1 at_home    other
## 4   <NA>    F  NA       U     GT3       T    4    2  health services
## 5     GP    F  NA       U     GT3       T    3    3   other    other
## 6     GP    M  16    <NA>     LE3       T    4    3    <NA>    other
##       reason guardian traveltime studytime failures schoolsup famsup paid
## 1     course   mother          2         2        0      <NA>     no   no
## 2     course   father          1        NA        0        no    yes   no
## 3      other   mother          1         2        3       yes     no  yes
## 4       home   mother          1         3        0      <NA>    yes  yes
## 5       home   father          1         2        0        no   <NA>  yes
## 6 reputation   mother          1        NA        0        no    yes  yes
##   activities nursery higher internet romantic famrel freetime goout Dalc
## 1         no    <NA>    yes       no       no      4        3    NA   NA
## 2         no      no    yes      yes       no      5        3     3    1
## 3         no     yes    yes      yes       no      4        3    NA    2
## 4        yes     yes   <NA>      yes      yes      3       NA     2    1
## 5         no     yes   <NA>     <NA>       no      4        3     2    1
## 6        yes     yes    yes      yes       no      5        4     2    1
##   Walc health absences G1 G2 G3
## 1    1      3        6  5  6  6
## 2    1     NA        4 NA  5  6
## 3    3      3       10 NA  8 10
## 4    1     NA        2 15 14 15
## 5    2      5        4  6 10 10
## 6    2      5       NA 15 NA 15
 # Confirm the container type after as.data.frame(lapply(...)).
 class(andmed)
## [1] "data.frame"

sapply puhul jääb eraldi data.frame-ks tegemine ära, sest sapply lihtsustab tulemuse ise maatriksiks; see aga tekitab probleeme tekstiliste tulpadega (faktorist jääb alles ainult taseme number).

 # Re-read the raw data. stringsAsFactors = TRUE is required on R >= 4.0 for
 # the text columns to be factors, which is what this example demonstrates.
 andmed <- read.table("student-mat.csv", header = TRUE, sep = ";", dec = ".",
                      stringsAsFactors = TRUE)
 # sapply() simplifies the equal-length columns into a single matrix; factor
 # columns contribute only their integer codes.
 andmed <- sapply(andmed, function(x) {
   x[sample(c(TRUE, NA), prob = c(0.9, 0.1), size = length(x), replace = TRUE)]
 })
head(andmed)
##      school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason
## [1,]      1   1  18       2       1       1   NA   NA    1    5      1
## [2,]     NA   1  17       2       1       2    1    1    1    3      1
## [3,]      1   1  15       2       2       2    1    1    1    3      3
## [4,]     NA   1  NA       2      NA       2    4    2    2    4      2
## [5,]      1   1  16       2       1       2    3    3    3    3      2
## [6,]     NA   2  16       2       2       2    4    3    4    3     NA
##      guardian traveltime studytime failures schoolsup famsup paid
## [1,]        2          2         2        0         2      1    1
## [2,]        1          1         2        0         1      2    1
## [3,]        2          1         2        3         2      1    2
## [4,]       NA          1         3        0         1      2    2
## [5,]        1          1         2        0        NA      2    2
## [6,]        2          1         2        0         1      2    2
##      activities nursery higher internet romantic famrel freetime goout
## [1,]          1       2      2        1       NA      4        3     4
## [2,]          1       1      2        2       NA      5        3     3
## [3,]          1       2      2        2        1      4        3     2
## [4,]          2       2      2        2        2      3        2     2
## [5,]         NA       2      2        1        1      4        3     2
## [6,]          2       2      2        2       NA      5        4     2
##      Dalc Walc health absences G1 G2 G3
## [1,]    1    1     NA        6  5  6  6
## [2,]    1    1      3        4  5  5  6
## [3,]    2    3      3       10  7  8 10
## [4,]    1    1      5        2 15 14 15
## [5,]   NA    2      5        4  6 10 10
## [6,]    1    2      5       10 15 15 15
# The sapply() result above is a matrix, no longer a data frame.
class(andmed)
## [1] "matrix"

Eraldi käskudena kirjutades saab ka juhuvalikuga võetud kohtadele NAd kirjutada.

 # Re-read the raw data; stringsAsFactors = TRUE keeps text columns as factors
 # on R >= 4.0 as well.
 andmed <- read.table("student-mat.csv", header = TRUE, sep = ";", dec = ".",
                      stringsAsFactors = TRUE)
 # Overwrite a random tenth of every column with NA. sample.int() draws
 # distinct positions with equal probability; round(runif(...)) could repeat a
 # position and picked the first and last row only half as often as the rest.
 andmed <- sapply(andmed, function(x) {
   x[sample.int(length(x), length(x) %/% 10)] <- NA
   x
 })
head(andmed)
##      school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason
## [1,]      1   1  18       2       1      NA    4    4    1    5      1
## [2,]      1   1  NA       2       1       2    1    1    1    3      1
## [3,]      1   1  15       2       2       2    1    1    1    3      3
## [4,]      1   1  15       2       1       2    4    2    2   NA      2
## [5,]      1   1  16      NA       1       2    3    3    3    3      2
## [6,]     NA   2  16       2       2       2    4    3    4    3      4
##      guardian traveltime studytime failures schoolsup famsup paid
## [1,]        2          2         2        0         2      1    1
## [2,]        1          1         2        0         1      2    1
## [3,]        2          1         2        3         2      1    2
## [4,]        2          1         3        0         1      2   NA
## [5,]        1          1         2        0         1      2    2
## [6,]        2         NA         2        0         1      2    2
##      activities nursery higher internet romantic famrel freetime goout
## [1,]          1       2      2        1        1      4        3     4
## [2,]          1       1      2        2        1      5       NA     3
## [3,]          1       2      2        2        1      4        3     2
## [4,]          2       2      2        2        2      3        2     2
## [5,]          1       2     NA        1        1      4        3     2
## [6,]          2       2      2        2        1      5        4     2
##      Dalc Walc health absences G1 G2 G3
## [1,]    1    1      3       NA  5  6  6
## [2,]    1    1      3        4  5  5 NA
## [3,]    2    3      3       10 NA  8 10
## [4,]   NA    1      5        2 15 14 15
## [5,]    1    2      5        4  6 10 10
## [6,]    1    2      5       10 15 15 15
# The sapply() result above is again a matrix.
class(andmed)
## [1] "matrix"

Maatriksiks muutmata

 # Re-read the raw data; stringsAsFactors = TRUE keeps text columns as factors
 # on R >= 4.0 as well.
 andmed <- read.table("student-mat.csv", header = TRUE, sep = ";", dec = ".",
                      stringsAsFactors = TRUE)
 # Assign NA column by column inside the data frame, so it never becomes a
 # matrix. sample.int() picks a tenth of the rows, each at most once and with
 # equal probability, unlike round(runif(...)).
 n_rida <- nrow(andmed)
 for (tulp in names(andmed)) {
   andmed[sample.int(n_rida, n_rida %/% 10), tulp] <- NA
 }
 head(andmed)
##   school sex age address famsize Pstatus Medu Fedu     Mjob     Fjob
## 1     GP   F  18       U     GT3       A    4    4  at_home     <NA>
## 2     GP   F  17       U     GT3       T   NA    1  at_home    other
## 3     GP   F  15       U     LE3       T   NA   NA  at_home    other
## 4     GP   F  15       U     GT3       T    4    2   health services
## 5     GP   F  16       U     GT3    <NA>    3    3    other    other
## 6     GP   M  16       U    <NA>       T    4    3 services    other
##       reason guardian traveltime studytime failures schoolsup famsup paid
## 1     course   mother          2        NA        0       yes     no   no
## 2     course     <NA>          1        NA        0        no    yes   no
## 3      other   mother          1         2        3       yes     no  yes
## 4       home   mother          1         3        0        no    yes  yes
## 5       home   father          1         2        0      <NA>    yes  yes
## 6 reputation   mother          1         2        0        no   <NA>  yes
##   activities nursery higher internet romantic famrel freetime goout Dalc
## 1         no     yes    yes       no     <NA>     NA        3     4    1
## 2         no      no    yes      yes       no      5        3     3    1
## 3         no     yes    yes      yes       no      4        3     2    2
## 4        yes     yes   <NA>      yes      yes      3        2     2    1
## 5         no     yes    yes       no       no      4        3     2    1
## 6        yes     yes    yes      yes       no     NA        4     2    1
##   Walc health absences G1 G2 G3
## 1    1      3        6  5  6  6
## 2   NA      3       NA  5  5 NA
## 3    3     NA       10  7  8 10
## 4    1     NA        2 15 14 15
## 5    2      5        4  6 NA 10
## 6    2      5       10 15 15 15
 # In-place column assignment leaves the object a data frame.
 class(andmed)
## [1] "data.frame"