Andmed sisse
# Load the student data and blank out roughly 10% of the cells in every column.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 as well;
# the class listings printed further down show these columns as "factor".
andmed <- read.table(
  "student-mat.csv",
  header = TRUE, sep = ";", dec = ".",
  stringsAsFactors = TRUE
)
set.seed(1234)  # fixed seed so the injected NA pattern is repeatable
# Indexing with a TRUE/NA mask keeps the value for TRUE and yields NA for NA.
andmed <- as.data.frame(lapply(andmed, function(x) {
  x[sample(c(TRUE, NA), prob = c(0.9, 0.1), size = length(x), replace = TRUE)]
}))
Põhjuse järgi grupeerimine ja keskmise leidmine kahel moel
# Mean of G3 within each level of reason; NAs in G3 are dropped first.
with(andmed, tapply(G3, reason, mean, na.rm = TRUE))
## course home other reputation
## 9.62500 10.68478 11.24000 11.14458
# Same per-reason mean of G3, returned by aggregate() as a data frame.
aggregate(
  x = andmed[["G3"]],
  by = list(andmed[["reason"]]),
  FUN = mean,
  na.rm = TRUE
)
## Group.1 x
## 1 course 9.62500
## 2 home 10.68478
## 3 other 11.24000
## 4 reputation 11.14458
Mitme tulba järgi grupeerimine
# Mean of G3 for every reason x sex combination, in long (data frame) form.
aggregate(
  x = andmed[["G3"]],
  by = list(andmed[["reason"]], andmed[["sex"]]),
  FUN = mean,
  na.rm = TRUE
)
## Group.1 Group.2 x
## 1 course F 8.957447
## 2 home F 10.050000
## 3 other F 12.769231
## 4 reputation F 10.717391
## 5 course M 10.454545
## 6 home M 11.977273
## 7 other M 10.000000
## 8 reputation M 12.482759
# Same means as a reason x sex matrix.
with(andmed, tapply(G3, list(reason, sex), FUN = mean, na.rm = TRUE))
## F M
## course 8.957447 10.45455
## home 10.050000 11.97727
## other 12.769231 10.00000
## reputation 10.717391 12.48276
Loendamine tunnuspaaride kaupa
# Cross-tabulate reason against sex (rows with an NA in either are left out).
table(andmed[["reason"]], andmed[["sex"]])
##
## F M
## course 54 63
## home 44 44
## other 13 12
## reputation 51 30
# The same counts via tapply(): length of G3 in every reason x sex cell.
tapply(
  X = andmed[["G3"]],
  INDEX = list(andmed[["reason"]], andmed[["sex"]]),
  FUN = length
)
## F M
## course 54 63
## home 44 44
## other 13 12
## reputation 51 30
# And once more via aggregate(), giving one row per reason x sex pair.
aggregate(
  x = andmed[["G3"]],
  by = list(andmed[["reason"]], andmed[["sex"]]),
  FUN = length
)
## Group.1 Group.2 x
## 1 course F 54
## 2 home F 44
## 3 other F 13
## 4 reputation F 51
## 5 course M 63
## 6 home M 44
## 7 other M 12
## 8 reputation M 30
Puuduvate väärtuste välja toomine
# Cross-tabulation that also reports NA as its own row and column.
table(andmed[["reason"]], andmed[["sex"]], useNA = "always")
##
## F M <NA>
## course 54 63 19
## home 44 44 9
## other 13 12 4
## reputation 51 30 8
## <NA> 23 14 7
Reaprotsendid
# Row shares: each reason row of the count matrix is scaled to sum to 1.
prop.table(
  tapply(
    andmed[["G3"]],
    list(andmed[["reason"]], andmed[["sex"]]),
    FUN = length
  ),
  margin = 1
)
## F M
## course 0.4615385 0.5384615
## home 0.5000000 0.5000000
## other 0.5200000 0.4800000
## reputation 0.6296296 0.3703704
# Class of every column of the data frame.
sapply(X = andmed, FUN = class)
## school sex age address famsize Pstatus
## "factor" "factor" "integer" "factor" "factor" "factor"
## Medu Fedu Mjob Fjob reason guardian
## "integer" "integer" "factor" "factor" "factor" "factor"
## traveltime studytime failures schoolsup famsup paid
## "integer" "integer" "integer" "factor" "factor" "factor"
## activities nursery higher internet romantic famrel
## "factor" "factor" "factor" "factor" "factor" "integer"
## freetime goout Dalc Walc health absences
## "integer" "integer" "integer" "integer" "integer" "integer"
## G1 G2 G3
## "integer" "integer" "integer"
Sama, arvestades ka puuduvaid väärtusi
# Row shares computed from the table that includes the NA row and column.
prop.table(
  table(andmed[["reason"]], andmed[["sex"]], useNA = "always"),
  margin = 1
)
##
## F M <NA>
## course 0.39705882 0.46323529 0.13970588
## home 0.45360825 0.45360825 0.09278351
## other 0.44827586 0.41379310 0.13793103
## reputation 0.57303371 0.33707865 0.08988764
## <NA> 0.52272727 0.31818182 0.15909091
Sama eraldi käskudega arvutades
# Row shares once more, assembled step by step in a helper data frame.
abi <- as.data.frame(
  tapply(andmed[["G3"]], list(andmed[["reason"]], andmed[["sex"]]), FUN = length)
)
abi[["kokku"]] <- rowSums(abi)  # temporary column holding the row totals
abi[["F"]] <- abi[["F"]] / abi[["kokku"]]
abi[["M"]] <- abi[["M"]] / abi[["kokku"]]
abi[["kokku"]] <- NULL  # remove the temporary column again
abi
## F M
## course 0.4615385 0.5384615
## home 0.5000000 0.5000000
## other 0.5200000 0.4800000
## reputation 0.6296296 0.3703704
Abiinfo (help) järgi teisendab käsk apply andmestiku kõigepealt maatriksiks (kus kõik andmed on ühte tüüpi) ning alles seejärel hakkab selle põhjal midagi arvutama.
# apply() over columns; the printed result reports "character" for every
# column, including the numeric ones.
apply(X = andmed, MARGIN = 2, FUN = class)
## school sex age address famsize Pstatus
## "character" "character" "character" "character" "character" "character"
## Medu Fedu Mjob Fjob reason guardian
## "character" "character" "character" "character" "character" "character"
## traveltime studytime failures schoolsup famsup paid
## "character" "character" "character" "character" "character" "character"
## activities nursery higher internet romantic famrel
## "character" "character" "character" "character" "character" "character"
## freetime goout Dalc Walc health absences
## "character" "character" "character" "character" "character" "character"
## G1 G2 G3
## "character" "character" "character"
Funktsioon sapply paistab toimetavat tulpade kaupa.
# sapply() visits the data frame column by column, so each column reports
# its own class.
sapply(X = andmed, FUN = class)
## school sex age address famsize Pstatus
## "factor" "factor" "integer" "factor" "factor" "factor"
## Medu Fedu Mjob Fjob reason guardian
## "integer" "integer" "factor" "factor" "factor" "factor"
## traveltime studytime failures schoolsup famsup paid
## "integer" "integer" "integer" "factor" "factor" "factor"
## activities nursery higher internet romantic famrel
## "factor" "factor" "factor" "factor" "factor" "integer"
## freetime goout Dalc Walc health absences
## "integer" "integer" "integer" "integer" "integer" "integer"
## G1 G2 G3
## "integer" "integer" "integer"
Täisarvuliste tulpade vahelised korrelatsiooniseosed
# Pairwise correlations between the integer columns.
# is.integer() tests the storage type directly (it returns FALSE for factors)
# and vapply() guarantees a logical mask; comparing sapply(andmed, class)
# against a string breaks as soon as a column carries more than one class.
# drop = FALSE keeps a data frame even if only one column qualifies.
cor(
  andmed[, vapply(andmed, is.integer, logical(1)), drop = FALSE],
  use = "pairwise.complete.obs"
)
## age Medu Fedu traveltime
## age 1.000000000 -0.170999347 -0.1374671154 0.103723025
## Medu -0.170999347 1.000000000 0.6059590196 -0.119358465
## Fedu -0.137467115 0.605959020 1.0000000000 -0.124463406
## traveltime 0.103723025 -0.119358465 -0.1244634061 1.000000000
## studytime 0.008913916 0.064399347 -0.0311792347 -0.072394529
## failures 0.218085518 -0.242623528 -0.2489395542 0.026658338
## famrel 0.066064900 -0.038538830 0.0005439266 -0.021052016
## freetime 0.024219231 0.024180673 -0.0104415692 0.003118606
## goout 0.129272441 0.027760804 0.0741534270 0.001543347
## Dalc 0.138055607 0.045468774 -0.0245558881 0.150626429
## Walc 0.111933608 -0.035694867 0.0023927012 0.105411479
## health -0.109681336 0.002085572 -0.0047982034 0.005359396
## absences 0.216133428 0.103249508 0.0477099734 0.033528644
## G1 -0.093000940 0.168511582 0.1660233240 -0.078166755
## G2 -0.155886938 0.193440576 0.1788353411 -0.177215724
## G3 -0.177125537 0.209386411 0.1804063537 -0.105994400
## studytime failures famrel freetime
## age 0.008913916 0.21808552 0.0660648998 0.024219231
## Medu 0.064399347 -0.24262353 -0.0385388303 0.024180673
## Fedu -0.031179235 -0.24893955 0.0005439266 -0.010441569
## traveltime -0.072394529 0.02665834 -0.0210520155 0.003118606
## studytime 1.000000000 -0.19400006 0.0301405684 -0.153998320
## failures -0.194000064 1.00000000 -0.0334099567 0.107280148
## famrel 0.030140568 -0.03340996 1.0000000000 0.143597193
## freetime -0.153998320 0.10728015 0.1435971935 1.000000000
## goout -0.082795566 0.13917574 0.0955985295 0.261033334
## Dalc -0.200847125 0.15316563 -0.0129843358 0.229145730
## Walc -0.223366059 0.14020486 -0.1071959638 0.181197678
## health -0.094705689 0.03491739 0.1143237982 0.066421298
## absences -0.049614820 0.03245605 -0.0555200125 -0.083883191
## G1 0.185901272 -0.32684109 0.0093982637 -0.027543461
## G2 0.156807890 -0.33271215 -0.0477232590 -0.059467549
## G3 0.154662840 -0.36771562 0.0585591968 -0.018546232
## goout Dalc Walc health
## age 0.129272441 0.13805561 0.111933608 -0.109681336
## Medu 0.027760804 0.04546877 -0.035694867 0.002085572
## Fedu 0.074153427 -0.02455589 0.002392701 -0.004798203
## traveltime 0.001543347 0.15062643 0.105411479 0.005359396
## studytime -0.082795566 -0.20084712 -0.223366059 -0.094705689
## failures 0.139175741 0.15316563 0.140204857 0.034917389
## famrel 0.095598530 -0.01298434 -0.107195964 0.114323798
## freetime 0.261033334 0.22914573 0.181197678 0.066421298
## goout 1.000000000 0.24102492 0.401509922 0.020634943
## Dalc 0.241024917 1.00000000 0.639016469 0.064033736
## Walc 0.401509922 0.63901647 1.000000000 0.061752621
## health 0.020634943 0.06403374 0.061752621 1.000000000
## absences 0.043470310 0.11098547 0.144864538 -0.024171090
## G1 -0.163679732 -0.10885350 -0.137373404 -0.036866050
## G2 -0.202536107 -0.07341200 -0.094939468 -0.088890117
## G3 -0.139877566 -0.07131519 -0.071548758 -0.083961364
## absences G1 G2 G3
## age 0.2161334276 -0.093000940 -0.1558869383 -0.17712554
## Medu 0.1032495080 0.168511582 0.1934405759 0.20938641
## Fedu 0.0477099734 0.166023324 0.1788353411 0.18040635
## traveltime 0.0335286444 -0.078166755 -0.1772157238 -0.10599440
## studytime -0.0496148202 0.185901272 0.1568078902 0.15466284
## failures 0.0324560493 -0.326841095 -0.3327121510 -0.36771562
## famrel -0.0555200125 0.009398264 -0.0477232590 0.05855920
## freetime -0.0838831907 -0.027543461 -0.0594675488 -0.01854623
## goout 0.0434703105 -0.163679732 -0.2025361071 -0.13987757
## Dalc 0.1109854688 -0.108853498 -0.0734120016 -0.07131519
## Walc 0.1448645381 -0.137373404 -0.0949394684 -0.07154876
## health -0.0241710897 -0.036866050 -0.0888901167 -0.08396136
## absences 1.0000000000 -0.033866911 0.0001359088 0.04458739
## G1 -0.0338669111 1.000000000 0.8419546857 0.80824586
## G2 0.0001359088 0.841954686 1.0000000000 0.90034167
## G3 0.0445873876 0.808245856 0.9003416708 1.00000000
Seosed graafiliselt, aga sai vist veidi palju korraga. Samuti jäävad täisarvulised väärtused joonisel üksteise peale.
# Scatterplot matrix of the integer columns.
# The columns are selected with a type-stable logical mask (is.integer() is
# FALSE for factors) instead of comparing class strings; drop = FALSE keeps
# the selection a data frame.
pairs(andmed[, vapply(andmed, is.integer, logical(1)), drop = FALSE])
Puuduvate väärtuste loetelu tulpade kaupa
# Number of missing values in each column.
sapply(X = andmed, FUN = function(veerg) sum(is.na(veerg)))
## school sex age address famsize Pstatus
## 44 47 48 32 31 42
## Medu Fedu Mjob Fjob reason guardian
## 42 45 39 45 44 44
## traveltime studytime failures schoolsup famsup paid
## 41 33 40 38 42 40
## activities nursery higher internet romantic famrel
## 30 36 39 29 46 28
## freetime goout Dalc Walc health absences
## 30 37 38 35 47 46
## G1 G2 G3
## 37 38 36
Sama joonisena
# Store the per-column NA counts and draw them as a bar chart;
# las = 3 turns the axis labels perpendicular to the axis.
puuduvad <- sapply(X = andmed, FUN = function(veerg) sum(is.na(veerg)))
barplot(height = puuduvad, las = 3)
Kasvavana järjestatult
# Same chart with the bars in ascending order; the y axis runs 5 units past
# the largest count.
barplot(
  height = sort(puuduvad),
  las = 3,
  ylim = c(0, max(puuduvad) + 5),
  ylab = "Puuduvate väärtuste arv tunnuses"
)
Kolme hinde (G1, G2, G3) keskmine tulemus kooli valimise põhjuse (reason) kaupa
# Per-row average of G1, G2 and G3, then the mean of those averages within
# each reason group; rows with any missing grade yield NA and are dropped.
with(andmed, tapply((G1 + G2 + G3) / 3, reason, mean, na.rm = TRUE))
## course home other reputation
## 9.90000 10.86111 10.31667 11.28311
Sama sapply abil.
# Same per-reason average of the three grade columns, computed with sapply().
# `%in%` matches NA against NA and never returns NA itself, so one subsetting
# expression covers both the regular reasons and the missing-reason group,
# and no all-NA phantom rows are created for na.omit() to clean up.
sapply(unique(andmed$reason), function(tyyp) {
  tulbad <- c("G1", "G2", "G3")
  # Keep only the rows of this group in which all three grades are present.
  arvud <- na.omit(andmed[andmed$reason %in% tyyp, tulbad])
  # Sum of all grades divided by rows and by columns, i.e. the mean of the
  # per-row averages.
  vastus <- sum(arvud) / nrow(arvud) / length(tulbad)
  names(vastus) <- as.character(tyyp)
  vastus
})
## course other home reputation <NA>
## 9.90000 10.31667 10.86111 11.28311 11.10101
Sisseloetavate andmete algvariant.
Käsk sample võtab etteantud loetelust (TRUE, NA) etteantud tõenäosuste (0.9, 0.1) järgi väärtusi, kokku nii palju, kui funktsiooni parameetriks antud tulbas (x) väärtusi on. Parameeter replace=TRUE on vajalik selleks, et sama väärtust tohiks korduvalt võtta; muidu poleks kaheliikmelisest sisendist võimalik saada üle kahe tulemuse.
Tulemuseks saadakse TRUE/NA-de rida. Kantsulg-operaatoriga indekseerimisel tagastatakse TRUE korral x-kogumi vastav väärtus ja NA korral NA, mis jääbki tulemusse sisse.
lapply väljastab listi (võtmed ja väärtused, iga tulp eraldi), as.data.frame paneb kõik selle jälle üldisesse ja arusaadavasse tabelisse kokku.
# Reload the raw file and inject NAs via lapply() + as.data.frame().
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0 too.
andmed <- read.table(
  "student-mat.csv",
  header = TRUE, sep = ";", dec = ".",
  stringsAsFactors = TRUE
)
# A TRUE/NA mask used as an index keeps the value for TRUE and gives NA for NA;
# lapply() returns a list of columns that as.data.frame() reassembles.
andmed <- as.data.frame(lapply(andmed, function(x) {
  x[sample(c(TRUE, NA), prob = c(0.9, 0.1), size = length(x), replace = TRUE)]
}))
head(andmed)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob
## 1 GP <NA> 18 U GT3 A NA 4 <NA> teacher
## 2 GP F 17 U GT3 T 1 1 at_home other
## 3 <NA> F 15 U <NA> T 1 1 at_home other
## 4 <NA> F NA U GT3 T 4 2 health services
## 5 GP F NA U GT3 T 3 3 other other
## 6 GP M 16 <NA> LE3 T 4 3 <NA> other
## reason guardian traveltime studytime failures schoolsup famsup paid
## 1 course mother 2 2 0 <NA> no no
## 2 course father 1 NA 0 no yes no
## 3 other mother 1 2 3 yes no yes
## 4 home mother 1 3 0 <NA> yes yes
## 5 home father 1 2 0 no <NA> yes
## 6 reputation mother 1 NA 0 no yes yes
## activities nursery higher internet romantic famrel freetime goout Dalc
## 1 no <NA> yes no no 4 3 NA NA
## 2 no no yes yes no 5 3 3 1
## 3 no yes yes yes no 4 3 NA 2
## 4 yes yes <NA> yes yes 3 NA 2 1
## 5 no yes <NA> <NA> no 4 3 2 1
## 6 yes yes yes yes no 5 4 2 1
## Walc health absences G1 G2 G3
## 1 1 3 6 5 6 6
## 2 1 NA 4 NA 5 6
## 3 3 3 10 NA 8 10
## 4 1 NA 2 15 14 15
## 5 2 5 4 6 10 10
## 6 2 5 NA 15 NA 15
# Confirm the container type of the lapply()/as.data.frame() result.
class(x = andmed)
## [1] "data.frame"
sapply puhul tundub, et eraldi data.frame-iks tegemine jääb ära (tulemuseks on maatriks); see aga tekitab probleeme tekstiliste tulpadega (alles jääb vaid faktori taseme number).
# Reload the raw file and inject NAs with sapply() instead of lapply().
# stringsAsFactors = TRUE makes the text columns factors on every R version,
# so the simplified result holds the factor codes shown in the output below.
andmed <- read.table(
  "student-mat.csv",
  header = TRUE, sep = ";", dec = ".",
  stringsAsFactors = TRUE
)
# sapply() simplifies the per-column results into a single matrix.
andmed <- sapply(andmed, function(x) {
  x[sample(c(TRUE, NA), prob = c(0.9, 0.1), size = length(x), replace = TRUE)]
})
head(andmed)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason
## [1,] 1 1 18 2 1 1 NA NA 1 5 1
## [2,] NA 1 17 2 1 2 1 1 1 3 1
## [3,] 1 1 15 2 2 2 1 1 1 3 3
## [4,] NA 1 NA 2 NA 2 4 2 2 4 2
## [5,] 1 1 16 2 1 2 3 3 3 3 2
## [6,] NA 2 16 2 2 2 4 3 4 3 NA
## guardian traveltime studytime failures schoolsup famsup paid
## [1,] 2 2 2 0 2 1 1
## [2,] 1 1 2 0 1 2 1
## [3,] 2 1 2 3 2 1 2
## [4,] NA 1 3 0 1 2 2
## [5,] 1 1 2 0 NA 2 2
## [6,] 2 1 2 0 1 2 2
## activities nursery higher internet romantic famrel freetime goout
## [1,] 1 2 2 1 NA 4 3 4
## [2,] 1 1 2 2 NA 5 3 3
## [3,] 1 2 2 2 1 4 3 2
## [4,] 2 2 2 2 2 3 2 2
## [5,] NA 2 2 1 1 4 3 2
## [6,] 2 2 2 2 NA 5 4 2
## Dalc Walc health absences G1 G2 G3
## [1,] 1 1 NA 6 5 6 6
## [2,] 1 1 3 4 5 5 6
## [3,] 2 3 3 10 7 8 10
## [4,] 1 1 5 2 15 14 15
## [5,] NA 2 5 4 6 10 10
## [6,] 1 2 5 10 15 15 15
# Confirm the container type of the sapply() result.
class(x = andmed)
## [1] "matrix"
Eraldi käskudena kirjutades saab ka juhuvalikuga võetud kohtadele NAd kirjutada.
# Reload the raw file and overwrite randomly chosen positions with NA.
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0 too.
andmed <- read.table(
  "student-mat.csv",
  header = TRUE, sep = ";", dec = ".",
  stringsAsFactors = TRUE
)
andmed <- sapply(andmed, function(x) {
  # sample.int() draws distinct positions with equal probability, so exactly
  # a tenth (rounded down) of the values become NA. round(runif(...)) could
  # repeat positions and picked the first and last one only half as often.
  x[sample.int(length(x), length(x) %/% 10)] <- NA
  x
})
head(andmed)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason
## [1,] 1 1 18 2 1 NA 4 4 1 5 1
## [2,] 1 1 NA 2 1 2 1 1 1 3 1
## [3,] 1 1 15 2 2 2 1 1 1 3 3
## [4,] 1 1 15 2 1 2 4 2 2 NA 2
## [5,] 1 1 16 NA 1 2 3 3 3 3 2
## [6,] NA 2 16 2 2 2 4 3 4 3 4
## guardian traveltime studytime failures schoolsup famsup paid
## [1,] 2 2 2 0 2 1 1
## [2,] 1 1 2 0 1 2 1
## [3,] 2 1 2 3 2 1 2
## [4,] 2 1 3 0 1 2 NA
## [5,] 1 1 2 0 1 2 2
## [6,] 2 NA 2 0 1 2 2
## activities nursery higher internet romantic famrel freetime goout
## [1,] 1 2 2 1 1 4 3 4
## [2,] 1 1 2 2 1 5 NA 3
## [3,] 1 2 2 2 1 4 3 2
## [4,] 2 2 2 2 2 3 2 2
## [5,] 1 2 NA 1 1 4 3 2
## [6,] 2 2 2 2 1 5 4 2
## Dalc Walc health absences G1 G2 G3
## [1,] 1 1 3 NA 5 6 6
## [2,] 1 1 3 4 5 5 NA
## [3,] 2 3 3 10 NA 8 10
## [4,] NA 1 5 2 15 14 15
## [5,] 1 2 5 4 6 10 10
## [6,] 1 2 5 10 15 15 15
# Confirm the container type after the assignment-based sapply() variant.
class(x = andmed)
## [1] "matrix"
Maatriksiks muutmata
# Reload the raw file and blank out random rows column by column, assigning
# into the data frame directly so it is never converted to a matrix.
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0 too.
andmed <- read.table(
  "student-mat.csv",
  header = TRUE, sep = ";", dec = ".",
  stringsAsFactors = TRUE
)
for (tulp in names(andmed)) {
  # Distinct, uniformly chosen row numbers: a tenth of the rows (rounded
  # down) per column. round(runif(...)) could repeat rows and under-sampled
  # the first and last row.
  andmed[sample.int(nrow(andmed), nrow(andmed) %/% 10), tulp] <- NA
}
head(andmed)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob
## 1 GP F 18 U GT3 A 4 4 at_home <NA>
## 2 GP F 17 U GT3 T NA 1 at_home other
## 3 GP F 15 U LE3 T NA NA at_home other
## 4 GP F 15 U GT3 T 4 2 health services
## 5 GP F 16 U GT3 <NA> 3 3 other other
## 6 GP M 16 U <NA> T 4 3 services other
## reason guardian traveltime studytime failures schoolsup famsup paid
## 1 course mother 2 NA 0 yes no no
## 2 course <NA> 1 NA 0 no yes no
## 3 other mother 1 2 3 yes no yes
## 4 home mother 1 3 0 no yes yes
## 5 home father 1 2 0 <NA> yes yes
## 6 reputation mother 1 2 0 no <NA> yes
## activities nursery higher internet romantic famrel freetime goout Dalc
## 1 no yes yes no <NA> NA 3 4 1
## 2 no no yes yes no 5 3 3 1
## 3 no yes yes yes no 4 3 2 2
## 4 yes yes <NA> yes yes 3 2 2 1
## 5 no yes yes no no 4 3 2 1
## 6 yes yes yes yes no NA 4 2 1
## Walc health absences G1 G2 G3
## 1 1 3 6 5 6 6
## 2 NA 3 NA 5 5 NA
## 3 3 NA 10 7 8 10
## 4 1 NA 2 15 14 15
## 5 2 5 4 6 NA 10
## 6 2 5 10 15 15 15
# Confirm that the loop-based variant left a data frame.
class(x = andmed)
## [1] "data.frame"