# Glim_in_R # 3. Yleistetyt lineaariset mallit R:ssä # 3.1 Esimerkkiaineisto ######################### kyphosis <- read.table("C:\\Kurssit\\Glim\\Glim02\\Datat\\kyphosis.txt", header = TRUE, sep = ",") attach(kyphosis) names(kyphosis) # [1] "Kyphosis" "Age" "Number" "Start" ########## # Aineisto on S-Plus-kirjastosta ########## # Muuttujien kuvaukset: -------- Kyphosis -------- Binäärinen muuttuja, jonka arvo (present/absent) kertoo, onko potilaalla leikkauksen jälkeen tietty selän epämuodostuma (kyphosis, kyttyräselkäisyys?) _________ Age --------- Lapsen ikä kuukausina --------- Number --------- Operaation kohteena olleiden selkänikamien lukumäärä --------- Start --------- The beginning of the range of the vertebrae involved in the operation ################################### par(mfrow=c(1,3)) # par(mfrow=c(1,3), cex=0.7) boxplot(Age ~ Kyphosis,ylab="Age") boxplot(Number ~ Kyphosis,ylab="Number") boxplot(Start ~ Kyphosis,ylab="Start") # 3.2 R:n funktiot ja oliot # 3.2.1 Mallin sovitus names(kyphosis) kyphosis[1:10,]; kyphosis[1:10,2]; kyphosis[c(1,5,81),c(1,4)] kyph.glm1 <- glm(Kyphosis ~ Age + Number + Start, family=binomial,data=kyphosis) kyph.glm1 summary(kyph.glm1) names(kyph.glm1) kyph.glm1$call # glm(formula = Kyphosis ~ Age + Number + Start, family = binomial, # data = kyphosis) # Lyhyempiä vaihtoehtoja joihinkin tilanteisiin glm(Kyphosis ~ ., binomial, kyphosis) # piste (.) tarkoittaa, että kaikkia käytetään # selittäjänä glm(Kyphosis =="present" ~ .,binomial,kyphosis) # Annetaan tapahtuma, jonka # todennäköisyyttä selitetään # glm(Kyphosis =="absent" ~ .,binomial,kyphosis) [1] "coefficients" "residuals" "fitted.values" [4] "effects" "R" "rank" [7] "qr" "family" "linear.predictors" [10] "deviance" "aic" "null.deviance" [13] "iter" "weights" "prior.weights" [16] "df.residual" "df.null" "y" [19] "converged" "boundary" "model" [22] "call" "formula" "terms" [25] "data" "offset" "control" [28] "method" "contrasts" "xlevels" kyph.glm1$coefficients; coefficients(kyph.glm1) kyph.glm1$linear.predictors[1:5] kyph.glm1$fitted.values[1:5] predict(kyph.glm1)[1:5] # Antaa lineaarisen prdiktorin arvot p <- exp(predict(kyph.glm1)[1:5]) pr <- p/(1+p) pr 1 2 3 4 5 0.2570153 0.1224749 0.4929995 0.4579668 0.0298565 predict(kyph.glm1, type= "response")[1:5] # Antaa todennäköisyydet # summary ######################### summary(kyph.glm1,cor=T) Call: glm(formula = Kyphosis ~ Age + Number + Start, family = binomial, data = kyphosis) Deviance Residuals: Min 1Q Median 3Q Max -2.3123 -0.5484 -0.3632 -0.1659 2.1613 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -2.036708 1.443836 -1.411 0.15836 Age 0.010929 0.006416 1.703 0.08849 . Number 0.410564 0.223772 1.835 0.06654 . Start -0.206501 0.067498 -3.059 0.00222 ** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 83.234 on 80 degrees of freedom Residual deviance: 61.380 on 77 degrees of freedom AIC: 69.38 Number of Fisher Scoring iterations: 4 Correlation of Coefficients: (Intercept) Age Number Age -0.4608 Number -0.8473 0.2284 Start -0.3836 -0.2820 0.1148 ####################################### # Residuaalit ############# residuals(kyph.glm1)[1:5] kyph.glm1$residuals[1:5] residuals(kyph.glm1,type="deviance")[1:5] residuals(kyph.glm1,type="working")[1:5] residuals(kyph.glm1,type="pearson")[1:5] residuals(kyph.glm1,type="response")[1:5] residuals(kyph.glm1,type="response")[1:5] # 1 2 3 4 5 # -0.2570153 -0.1224749 0.5070005 -0.4579668 -0.0298565 kyph.glm1$fitted.values[1:5] # 1 2 3 4 5 # 0.2570153 0.1224749 0.4929995 0.4579668 0.0298565 kyphosis[1:5,1] # [1] absent absent present absent absent # Levels: absent present # present=1 # Linkkifunktion ja varianssifunktion määrittely fam1 <- binomial() fam1 # Family: binomial # Link function: logit # names(fam1) # [1] "family" "link" "linkfun" "linkinv" "variance" # [6] "dev.resids" "aic" "mu.eta" "initialize" "validmu" # [11] "valideta" fam1$link # [1] "logit" fam1$variance # function (mu) # mu * (1 - mu) eta <- seq(-5,5,200) plot(range(eta), c(0,1), xlab="eta",ylab="mu",type="n") eta <- seq(-5,5,length=200) y<-exp(eta)/(1+exp(eta)) plot(range(eta), c(0,1), xlab="eta",ylab="mu",type="n") lines(eta,y) library(boot) eta <- seq(-5,5,length=200) plot(range(eta), c(0,1), xlab="eta",ylab="mu",type="n") lines(eta,inv.logit(eta)) # 3.2.3 Mallin päivittäminen ############################ kyph.glm2 <- update(kyph.glm1, ~ . -Age) # Muuttuja Age poistetaan mallista kyph.glm1 kyph.glm2 update(kyph.glm1, subset = -79) # Havainto 79 poistetaan aineistosta update(kyph.glm1, subset = 1:50) # Havainnot 1:50 mukana update(kyph.glm1, subset = ) # 3.2.4 Devianssianalyysin taulukot ################################### kyph.glm3 <- update(kyph.glm2, ~ . -Number) anova(kyph.glm1) anova(kyph.glm1,kyph.glm2,kyph.glm3) anova(kyph.glm1) ############ Analysis of Deviance Table Model: binomial, link: logit Response: Kyphosis Terms added sequentially (first to last) Df Deviance Resid. Df Resid. Dev NULL 80 83.234 Age 1 1.302 79 81.932 Number 1 10.306 78 71.627 Start 1 10.247 77 61.380 anova(kyph.glm1,kyph.glm2,kyph.glm3) ################## Analysis of Deviance Table Model 1: Kyphosis ~ Age + Number + Start Model 2: Kyphosis ~ Number + Start Model 3: Kyphosis ~ Start Resid. Df Resid. Dev Df Deviance 1 77 61.380 2 78 64.536 -1 -3.157 3 79 68.072 -1 -3.536 ################## kyph.anodev<- anova(kyph.glm1,kyph.glm2,kyph.glm3) kyph.anodev[,-1] kyph.anodev[1,] kyph.anodev$Deviance ########## # Testi-optio ############# anova(kyph.glm1, test = "Chisq") Analysis of Deviance Table Model: binomial, link: logit Response: Kyphosis Terms added sequentially (first to last) Df Deviance Resid. Df Resid. Dev P(>|Chi|) NULL 80 83.234 Age 1 1.302 79 81.932 0.254 Number 1 10.306 78 71.627 0.001 Start 1 10.247 77 61.380 0.001 anova1<- anova(kyph.glm1) stat.anova(anova1, test = "Chisq") # 3.2.5 Plottaus ################ # Osittaisrediduaali predict(kyph.glm1, type= "terms")+kyph.glm1$residuals predict(kyph.glm1, type= "terms")[1:3,1]+kyph.glm1$residuals[1] pres<- predict(kyph.glm1, type= "terms")+kyph.glm1$residuals par(mfrow=c(2,2)) #par(mfrow=c(1,3), cex=0.7) plot(Age,pres[,1]); plot(Number,pres[,1]); plot(Start,pres[,1]) # Kokeillaan esim. mallia kyph.glm4 <- glm(Kyphosis ~ poly(Age,2) + I((Start>12)*(Start-12)), family=binomial,data=kyphosis) kyph.glm4 ######### Call: glm(formula = Kyphosis ~ poly(Age, 2) + I((Start > 12) * (Start - 12)), family = binomial, data = kyphosis) Coefficients: (Intercept) poly(Age, 2)1 -0.685 5.772 poly(Age, 2)2 I((Start > 12) * (Start - 12)) -10.325 -1.351 Degrees of Freedom: 80 Total (i.e. Null); 77 Residual Null Deviance: 83.23 Residual Deviance: 51.95 AIC: 59.95 ######### predict(kyph.glm4, type= "terms", se =T) # 3.3 Laajennuksia # 3.3.1 Muita glm():n argumentteja # offset smoking.cdeath <- read.table("C:\\Kurssit\\Glim\\Glim02\\Datat\\smoking.cdeath.txt", header = TRUE, sep = ",") # smoking.cdeath names(smoking.cdeath) [1] "age" "agec" "agecsq" "smokeage" "smoking" [6] "deaths" "person.years" smoking.cd <- glm(deaths ~ offset(log(person.years)) + smoke + agec + agecsq + smokeage, family = poisson, data = smoking.cdeath) # smoking.cd ########## Call: glm(formula = deaths ~ offset(log(person.years)) + smoke + agec + agecsq + smokeage, family = poisson, data = smoking.cdeath) Coefficients: (Intercept) smoke agec agecsq smokeage -10.7918 1.4410 2.3765 -0.1977 -0.3075 Degrees of Freedom: 9 Total (i.e. Null); 5 Residual Null Deviance: 935.1 Residual Deviance: 1.635 AIC: 66.7 ########## # summary(smoking.cd) glm.control() smoking.cd <- glm(deaths ~ offset(log(person.years)) + smoke + agec + agecsq + smokeage, family = poisson, data = smoking.cdeath,trace=T) # 3.3.2 Diagnostiikkaa cooks.distance(kyph.glm4) plot(cooks.distance(kyph.glm4)) identify(cooks.distance(kyph.glm4)) dfbetas(kyph.glm4); plot(dfbetas(kyph.glm4)) b1<- dfbetas(kyph.glm4)[,2] plot(b1) covratio(kyph.glm4); plot(covratio(kyph.glm4));identify(covratio(kyph.glm4)) # 3.3.3 Askeltava mallinvalinta kyph.glm5 <- glm(Kyphosis ~ poly(Age,2) + I((Start>12)*(Start-12)) + poly(Number,3), family=binomial,data=kyphosis) kyph.step <- step(kyph.glm5) kyph.step$anova library(MASS) # stepAIC(object, scope, scale, direction=c("both", "backward", "forward"), # trace=1, keep=NULL, steps=1000, use.start=FALSE, k=2, ...) #birthwt.step2 <- stepAIC(birthwt.glm, ~ .^2 + I(scale(age)^2) # + I(scale(lwt)^2), trace=FALSE) kyph.sAIC<- stepAIC(kyph.glm1, ~ .^2 + poly(Age,2) + I((Start>12)*(Start-12))) # addterm; dropterm; step #house.glm1 <- update(house.glm0, . ~ . + Sat*(Infl+Type+Cont)) #addterm(house.glm1, ~. + Sat:(Infl+Type+Cont)^2, test = "Chisq") #dropterm(object, scope, scale = 0, test=c("none", "Chisq", "F"), # k = 2, sorted = FALSE, trace = FALSE, ...) kyph.add1<- addterm(kyph.glm1, ~ .^2, test= "Chisq") kyph.glm6 <- update(kyph.glm1, ~ .^2) kyph.drop1<- dropterm(kyph.glm6, test= "Chisq") # 3.3.4 Ennustaminen kyph.glm4 <- glm(Kyphosis ~ poly(Age,2) + I((Start>12)*(Start-12)), family=binomial,data=kyphosis) fitted(kyph.glm4)[1:5] predict(kyph.glm4)[1:5] predict.glm(kyph.glm4, type="response")[1:5] predict(kyph.glm4, type="terms")[1:5,] predict(kyph.glm1, type="terms")[1:5,] # X*b new.d<- data.frame(Age=seq(10,150,10),Number=c(rep(1:5,3)),Start=1:15) predict(kyph.glm1,newdata=new.d,type="response", se=T)[1:10] plot(predict.glm(kyph.glm1, type="response"))