Skype kontaktide analüüs

Draiver sisse

library("RSQLite")
## Warning: package 'RSQLite' was built under R version 3.2.5
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.2.5
drv=dbDriver("SQLite")

Konkreetse baasi külge

Faili nimeks paistis olema lihtsalt main.db, mitte main-SkypeR.db

db=dbConnect(drv, "C:/Users/Jaagup Kippar/AppData/Roaming/Skype/jaagup.kippar/main.db")

Tabelite loetelu baasis

dbListTables(db)
##  [1] "Accounts"           "Alerts"             "AppSchemaVersion"  
##  [4] "CallHandlers"       "CallMembers"        "Calls"             
##  [7] "ChatMembers"        "Chats"              "ContactGroups"     
## [10] "Contacts"           "Conversations"      "DbMeta"            
## [13] "LegacyMessages"     "MediaDocuments"     "MessageAnnotations"
## [16] "Messages"           "Participants"       "SMSes"             
## [19] "Transfers"          "Translators"        "VideoMessages"     
## [22] "Videos"             "Voicemails"         "tracker_journal"

Tulpade loetelu tabelis

dbListFields(db, "Contacts")
##   [1] "id"                             "is_permanent"                  
##   [3] "type"                           "skypename"                     
##   [5] "pstnnumber"                     "aliases"                       
##   [7] "fullname"                       "birthday"                      
##   [9] "gender"                         "languages"                     
##  [11] "country"                        "province"                      
##  [13] "city"                           "phone_home"                    
##  [15] "phone_office"                   "phone_mobile"                  
##  [17] "emails"                         "hashed_emails"                 
##  [19] "homepage"                       "about"                         
##  [21] "avatar_image"                   "mood_text"                     
##  [23] "rich_mood_text"                 "timezone"                      
##  [25] "capabilities"                   "profile_timestamp"             
##  [27] "nrof_authed_buddies"            "ipcountry"                     
##  [29] "avatar_timestamp"               "mood_timestamp"                
##  [31] "received_authrequest"           "authreq_timestamp"             
##  [33] "lastonline_timestamp"           "availability"                  
##  [35] "displayname"                    "firstname"                     
##  [37] "lastname"                       "refreshing"                    
##  [39] "given_authlevel"                "given_displayname"             
##  [41] "assigned_speeddial"             "assigned_comment"              
##  [43] "alertstring"                    "lastused_timestamp"            
##  [45] "authrequest_count"              "assigned_phone1"               
##  [47] "assigned_phone1_label"          "assigned_phone2"               
##  [49] "assigned_phone2_label"          "assigned_phone3"               
##  [51] "assigned_phone3_label"          "is_mobile"                     
##  [53] "is_trusted"                     "buddystatus"                   
##  [55] "isauthorized"                   "popularity_ord"                
##  [57] "external_id"                    "external_system_id"            
##  [59] "liveid_cid"                     "isblocked"                     
##  [61] "authorization_certificate"      "certificate_send_count"        
##  [63] "account_modification_serial_nr" "saved_directory_blob"          
##  [65] "nr_of_buddies"                  "server_synced"                 
##  [67] "contactlist_track"              "last_used_networktime"         
##  [69] "network_availability"           "authorized_time"               
##  [71] "sent_authrequest"               "sent_authrequest_time"         
##  [73] "sent_authrequest_serial"        "buddyblob"                     
##  [75] "cbl_future"                     "node_capabilities"             
##  [77] "revoked_auth"                   "added_in_shared_group"         
##  [79] "in_shared_group"                "authreq_history"               
##  [81] "profile_attachments"            "stack_version"                 
##  [83] "offline_authreq_id"             "node_capabilities_and"         
##  [85] "authreq_crc"                    "authreq_src"                   
##  [87] "pop_score"                      "authreq_nodeinfo"              
##  [89] "main_phone"                     "unified_servants"              
##  [91] "phone_home_normalized"          "phone_office_normalized"       
##  [93] "phone_mobile_normalized"        "sent_authrequest_initmethod"   
##  [95] "authreq_initmethod"             "verified_email"                
##  [97] "verified_company"               "sent_authrequest_extrasbitmask"
##  [99] "is_auto_buddy"                  "group_membership"              
## [101] "avatar_url"                     "avatar_url_new"                
## [103] "extprop_seen_birthday"          "extprop_sms_target"            
## [105] "extprop_external_data"

Andmed SELECT-päringuga sisse

Ligi pooltel oli buddystatus NA, testkontol ja ühel isikul 2 ning mõneteistkümnel siis 3. Milline staatus millisele numbrile vastab, ei suutnud esimese hooga tuvastada.

andmed=dbGetQuery(db, "SELECT displayname, skypename, languages, birthday, gender, buddystatus FROM Contacts where buddystatus=3")
andmed
##           displayname               skypename languages birthday gender
## 1  anne.mari.ernesaks      anne.mari.ernesaks      <NA>       NA     NA
## 2         Pille Eslon             pille.eslon      <NA>       NA     NA
## 3             KƤtlin                k2tsukas        et 19890217      2
## 4         ruizcalleja             ruizcalleja      <NA>       NA     NA
## 5       Peeter Marvet               petskratt        et       NA     NA
## 6         Antero Laas              anterolaas      <NA>       NA     NA
## 7           Aino Kiis               aino.kiis      <NA>       NA     NA
## 8        Ruth Randoja            ruth.randoja      <NA>       NA     NA
## 9        Siim Sildver                 ziim999        en 19911022      1
## 10    Kristo Tammeoja          kristotammeoja      <NA>       NA     NA
## 11      Triinu Raigna           triinu_raigna      <NA>       NA     NA
## 12         Vaiko Noor              vaiko.noor        et 19910227      1
## 13    Mari-Liis Peets          mari.liis.lokk      <NA>       NA     NA
## 14     Marianne Adson facebook:marianne.adson      <NA> 19810311      2
## 15      Toomas HƤide            toomas.haide      <NA>       NA     NA
## 16     Ulrika Tuppits         ulrika.danziger      <NA> 19810423      2
## 17        Annika Tuul 1:annika.tuul@gmail.com      <NA>       NA     NA
##    buddystatus
## 1            3
## 2            3
## 3            3
## 4            3
## 5            3
## 6            3
## 7            3
## 8            3
## 9            3
## 10           3
## 11           3
## 12           3
## 13           3
## 14           3
## 15           3
## 16           3
## 17           3

Märgitud kasutajakeelte jagunemine sisseloetud andmete hulgas

table(andmed$languages, useNA="always")
## 
##   en   et <NA> 
##    1    3   13

Sünniaja klass loeti kõigepealt sisse arvulisena

class(andmed[!is.na(andmed$birthday), "birthday"])
## [1] "integer"

Kuupäeva saab sobivale kujule määrata vaid nendel, kel see olemas. Kuna olemasoleva kuupäevaga inimesi läheb rohkemgi vaja, siis jätan nende järjekorranumbrid eraldi loetellu.

olemasnr=which(!is.na(andmed$birthday))
olemasnr
## [1]  3  9 12 14 16

Kuupäevatulp kuupäeva tüüpi - neil, kel olemas. Andmete lugemine etteantud vormingu kaudu. Kõigepalt tähtedeks ja siis sealt Date-ks.

andmed[olemasnr, "birthday"]=as.Date(as.character(andmed[olemasnr, "birthday"]), "%Y%m%d")
andmed
##           displayname               skypename languages birthday gender
## 1  anne.mari.ernesaks      anne.mari.ernesaks      <NA>       NA     NA
## 2         Pille Eslon             pille.eslon      <NA>       NA     NA
## 3             KƤtlin                k2tsukas        et     6987      2
## 4         ruizcalleja             ruizcalleja      <NA>       NA     NA
## 5       Peeter Marvet               petskratt        et       NA     NA
## 6         Antero Laas              anterolaas      <NA>       NA     NA
## 7           Aino Kiis               aino.kiis      <NA>       NA     NA
## 8        Ruth Randoja            ruth.randoja      <NA>       NA     NA
## 9        Siim Sildver                 ziim999        en     7964      1
## 10    Kristo Tammeoja          kristotammeoja      <NA>       NA     NA
## 11      Triinu Raigna           triinu_raigna      <NA>       NA     NA
## 12         Vaiko Noor              vaiko.noor        et     7727      1
## 13    Mari-Liis Peets          mari.liis.lokk      <NA>       NA     NA
## 14     Marianne Adson facebook:marianne.adson      <NA>     4087      2
## 15      Toomas HƤide            toomas.haide      <NA>       NA     NA
## 16     Ulrika Tuppits         ulrika.danziger      <NA>     4130      2
## 17        Annika Tuul 1:annika.tuul@gmail.com      <NA>       NA     NA
##    buddystatus
## 1            3
## 2            3
## 3            3
## 4            3
## 5            3
## 6            3
## 7            3
## 8            3
## 9            3
## 10           3
## 11           3
## 12           3
## 13           3
## 14           3
## 15           3
## 16           3
## 17           3

Klass endiselt muutmata

class(andmed$birthday)
## [1] "numeric"

Kirjutan lihtsalt üle

class(andmed$birthday)="Date"
class(andmed$birthday)
## [1] "Date"

Noorimat kontakti on mõtet leida ainult nende hulgast, kel sünnikuupäev olemas. Kuna muidu kippusid avaldised pikaks minema, siis tõstsin kuupäevaga inimesed eraldi muutujasse.

synnipaevagaandmed=andmed[olemasnr, ]

Sünnikuupäevaga inimesed järjestatuna kahanevalt sünnikuupäeva järgi

synnipaevagaandmed[rev(order(synnipaevagaandmed$birthday)),]
##       displayname               skypename languages   birthday gender
## 9    Siim Sildver                 ziim999        en 1991-10-22      1
## 12     Vaiko Noor              vaiko.noor        et 1991-02-27      1
## 3         KƤtlin                k2tsukas        et 1989-02-17      2
## 16 Ulrika Tuppits         ulrika.danziger      <NA> 1981-04-23      2
## 14 Marianne Adson facebook:marianne.adson      <NA> 1981-03-11      2
##    buddystatus
## 9            3
## 12           3
## 3            3
## 16           3
## 14           3

Neist esimese nimi

synnipaevagaandmed[rev(order(synnipaevagaandmed$birthday)),"displayname"][[1]]
## [1] "Siim Sildver"

Sugu faktoriks (enne oli täisarv)

andmed$gender=factor(andmed$gender, exclude=NULL, levels=c(1, 2, NA), labels=c("mees", "naine", "teadmata"))
andmed$gender
##  [1] teadmata teadmata naine    teadmata teadmata teadmata teadmata
##  [8] teadmata mees     teadmata teadmata mees     teadmata naine   
## [15] teadmata naine    teadmata
## Levels: mees naine teadmata

Sagedusjaotus

table(andmed$gender)
## 
##     mees    naine teadmata 
##        2        3       12

dplyr-i katsetus

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
db=src_sqlite("C:/Users/Jaagup Kippar/AppData/Roaming/Skype/jaagup.kippar/main.db")

Tabeli eraldamine

kontaktid=tbl(db, "Contacts")
kontaktid
## Source: sqlite 3.8.6 [C:/Users/Jaagup Kippar/AppData/Roaming/Skype/jaagup.kippar/main.db]
## From: Contacts [43 x 105]
## 
##       id is_permanent  type          skypename pstnnumber aliases
##    (int)        (int) (int)              (chr)      (chr)   (chr)
## 1     23            1     1            echo123         NA      NA
## 2    703            1     1 anne.mari.ernesaks         NA      NA
## 3    707            1     1        pille.eslon         NA      NA
## 4    711            1     1           k2tsukas         NA      NA
## 5    715            1     1        ruizcalleja         NA      NA
## 6    719            1     1          petskratt         NA      NA
## 7    723            1     1         raido.rokk         NA      NA
## 8    727            1     1         anterolaas         NA      NA
## 9    731            1     1          aino.kiis         NA      NA
## 10   735            1     1       ruth.randoja         NA      NA
## ..   ...          ...   ...                ...        ...     ...
## Variables not shown: fullname (chr), birthday (int), gender (int),
##   languages (chr), country (chr), province (chr), city (chr), phone_home
##   (chr), phone_office (chr), phone_mobile (chr), emails (chr),
##   hashed_emails (chr), homepage (chr), about (chr), avatar_image (chr),
##   mood_text (chr), rich_mood_text (chr), timezone (int), capabilities
##   (chr), profile_timestamp (int), nrof_authed_buddies (int), ipcountry
##   (chr), avatar_timestamp (int), mood_timestamp (int),
##   received_authrequest (chr), authreq_timestamp (int),
##   lastonline_timestamp (int), availability (int), displayname (chr),
##   firstname (chr), lastname (chr), refreshing (int), given_authlevel
##   (int), given_displayname (chr), assigned_speeddial (chr),
##   assigned_comment (chr), alertstring (chr), lastused_timestamp (int),
##   authrequest_count (int), assigned_phone1 (chr), assigned_phone1_label
##   (chr), assigned_phone2 (chr), assigned_phone2_label (chr),
##   assigned_phone3 (chr), assigned_phone3_label (chr), is_mobile (int),
##   is_trusted (int), buddystatus (int), isauthorized (int), popularity_ord
##   (int), external_id (chr), external_system_id (chr), liveid_cid (chr),
##   isblocked (int), authorization_certificate (chr), certificate_send_count
##   (int), account_modification_serial_nr (int), saved_directory_blob (chr),
##   nr_of_buddies (int), server_synced (int), contactlist_track (int),
##   last_used_networktime (int), network_availability (int), authorized_time
##   (int), sent_authrequest (chr), sent_authrequest_time (int),
##   sent_authrequest_serial (int), buddyblob (chr), cbl_future (chr),
##   node_capabilities (int), revoked_auth (int), added_in_shared_group
##   (int), in_shared_group (int), authreq_history (chr), profile_attachments
##   (chr), stack_version (int), offline_authreq_id (int),
##   node_capabilities_and (int), authreq_crc (int), authreq_src (int),
##   pop_score (int), authreq_nodeinfo (chr), main_phone (chr),
##   unified_servants (chr), phone_home_normalized (chr),
##   phone_office_normalized (chr), phone_mobile_normalized (chr),
##   sent_authrequest_initmethod (int), authreq_initmethod (int),
##   verified_email (chr), verified_company (chr),
##   sent_authrequest_extrasbitmask (int), is_auto_buddy (int),
##   group_membership (int), avatar_url (chr), avatar_url_new (chr),
##   extprop_seen_birthday (int), extprop_sms_target (int),
##   extprop_external_data (chr)

Kasutajanimed

kontaktid %>%  filter(buddystatus==3) %>% select(skypename)
## Source: sqlite 3.8.6 [C:/Users/Jaagup Kippar/AppData/Roaming/Skype/jaagup.kippar/main.db]
## From: Contacts [17 x 1]
## Filter: buddystatus == 3 
## 
##                  skypename
##                      (chr)
## 1       anne.mari.ernesaks
## 2              pille.eslon
## 3                 k2tsukas
## 4              ruizcalleja
## 5                petskratt
## 6               anterolaas
## 7                aino.kiis
## 8             ruth.randoja
## 9                  ziim999
## 10          kristotammeoja
## 11           triinu_raigna
## 12              vaiko.noor
## 13          mari.liis.lokk
## 14 facebook:marianne.adson
## 15            toomas.haide
## 16         ulrika.danziger
## 17 1:annika.tuul@gmail.com

Keelte jaotus. Ungroup ei ole vist niisama vaatamiseks kohustuslik, aga kasulik hilisema töötluse juures.

kontaktid %>%  filter(buddystatus==3)  %>%  group_by(languages)  %>%  summarize(n=count())  %>% ungroup()
## Source: sqlite 3.8.6 [C:/Users/Jaagup Kippar/AppData/Roaming/Skype/jaagup.kippar/main.db]
## From: <derived table> [?? x 2]
## 
##    languages     n
##        (chr) (int)
## 1         NA    13
## 2         en     1
## 3         et     3
## ..       ...   ...