# Diagnostics in regression: A visual example

#### thedreamshaper

##### New Member
The follwing R-code/picture shows a simple linear regression with no outliers and 3 different kind of outliers. It also plots the diagnostic values, confidence bands and prediction bands for each case.

It is ment to explain it for students (such as my self), most teachers forget the value of graphic means.

Anyway, im a student my self and i hope it doesnt contain any major screwups.

Code:
par(mfrow=c(3,4))

#Ingen outlier
x<- c(3,1,2,5,7,2.5,10,12,15,17,13)
y<- c(rnorm(11,x,1))
plot(x,y,xlim=c(-5,25),ylim=c(-5,30),main=paste("No outlier"))
model <- lm(y~1+x)
abline(0,1)
colors <- c("red", "blue", "green", "orange", "black")
#finder confidens og prædiktion
confs <- predict(model,newdata=data.frame(x=seq(-500,1000,length=10000)),interval='confidence')
preds <- predict(model,newdata=data.frame(x=seq(-500,1000,length=10000)),interval='prediction')
lims <- c(min(preds[,2]),max(preds[,]))
#plotter confidens for linje
confs<-confs[order(confs[,1]),]
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),confs[,3],col="red")
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),confs[,2],col="red")
#Plotter prædiktion
preds <- preds[order(preds[,1]),]
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),preds[,2],col="blue")
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),preds[,3],col="blue")
#labels
labels <- c("confidence", "prediction")
legend("topleft", inset=.05, title="Intervals",
labels, lwd=2, lty=c(1, 1, 1, 1, 2), col=colors)
model1 <-model

#God outlier
x<- c(3,1,2,5,7,2.5,10,12,15,17,13)
x<- c(x,90)
y<- c(rnorm(11,x,1),90)
plot(x,y,xlim=c(-5,100),ylim=c(-5,100),main=paste("Outlier with values as expected"))
model <- lm(y~1+x)
abline(0,1)
colors <- c("red", "blue", "green", "orange", "black")
#finder confidens og prædiktion
confs <- predict(model,newdata=data.frame(x=seq(-500,1000,length=10000)),interval='confidence')
preds <- predict(model,newdata=data.frame(x=seq(-500,1000,length=10000)),interval='prediction')
lims <- c(min(preds[,2]),max(preds[,]))
#plotter confidens for linje
confs<-confs[order(confs[,1]),]
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),confs[,3],col="red")
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),confs[,2],col="red")
#Plotter prædiktion
preds <- preds[order(preds[,1]),]
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),preds[,2],col="blue")
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),preds[,3],col="blue")
#labels
labels <- c("confidence", "prediction")
legend("topleft", inset=.05, title="Intervals",
labels, lwd=2, lty=c(1, 1, 1, 1, 2), col=colors)
model2 <-model

#Dårlig outlier
x<- c(3,1,2,5,7,2.5,10,12,15,17,13)
x<- c(x,7.954545)
y<- c(rnorm(11,x,1),130)
plot(x,y,xlim=c(-5,75),ylim=c(-5,150),main=paste("Outlier ruins intercept"))
abline(9.67,1)
model <- lm(y~x)
#finder confidens og prædiktion
confs <- predict(model,newdata=data.frame(x=seq(-500,1000,length=10000)),interval='confidence')
preds <- predict(model,newdata=data.frame(x=seq(-500,1000,length=10000)),interval='prediction')
lims <- c(min(preds[,2]),max(preds[,]))
#plotter confidens for linje
confs<-confs[order(confs[,1]),]
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),confs[,3],col="red")
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),confs[,2],col="red")
#Plotter prædiktion
preds <- preds[order(preds[,1]),]
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),preds[,2],col="blue")
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),preds[,3],col="blue")
#labels
legend("topright", inset=.01, title="Intervals",
labels, lwd=2, lty=c(1, 1, 1, 1, 2), col=colors)
model3 <-model

#Dårlig outlier
x<- c(3,1,2,5,7,2.5,10,12,15,17,13)
x<- c(x,60)
y<- c(rnorm(11,x,1),15)
plot(x,y,xlim=c(-5,75),ylim=c(-5,130),main=paste("Outlier ruins slope"))
abline(6,0.23)
model <- lm(y~x)
#finder confidens og prædiktion
confs <- predict(model,newdata=data.frame(x=seq(-500,1000,length=10000)),interval='confidence')
preds <- predict(model,newdata=data.frame(x=seq(-500,1000,length=10000)),interval='prediction')
lims <- c(min(preds[,2]),max(preds[,]))
#plotter confidens for linje
confs<-confs[order(confs[,1]),]
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),confs[,3],col="red")
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),confs[,2],col="red")
#Plotter prædiktion
preds <- preds[order(preds[,1]),]
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),preds[,2],col="blue")
lines(sort(x=seq(-500,1000,length=10000),decreasing=FALSE),preds[,3],col="blue")
#labels
labels <- c("confidence", "prediction")
legend("topleft", inset=.05, title="Intervals",
labels, lwd=2, lty=c(1, 1, 1, 1, 2), col=colors)
model4 <-model

plot(hatvalues(model1),std.res<-((rstandard(model1)^2)^0.5),cex=cooks.distance(model1)*5,xlab="Hat values",ylab="Nummerical std. residuals",main="Bubbel plot - Cirkel= CooksD*5",xlim=c(min(hatvalues(model1),0),max(hatvalues(model1),0.7)),ylim=c(min(rstandard(model1)),max(rstandard(model1),3)))
plot(hatvalues(model2),std.res<-((rstandard(model2)^2)^0.5),cex=cooks.distance(model2)*5,xlab="Hat values",ylab="Nummerical std. residuals",main="Bubbel plot - Cirkel= CooksD*5",xlim=c(min(hatvalues(model2),0),max(hatvalues(model2),0.7)),ylim=c(min(rstandard(model2)),max(rstandard(model2),3)))
plot(hatvalues(model3),std.res<-((rstandard(model3)^2)^0.5),cex=cooks.distance(model3)*5,xlab="Hat values",ylab="Nummerical std. residuals",main="Bubbel plot - Cirkel= CooksD*5",xlim=c(min(hatvalues(model3),0),max(hatvalues(model3),0.7)),ylim=c(min(rstandard(model3)),max(rstandard(model3),3)))
plot(hatvalues(model4),std.res<-((rstandard(model4)^2)^0.5),cex=cooks.distance(model4)*5,xlab="Hat values",ylab="Nummerical std. residuals",main="Bubbel plot - Cirkel= CooksD*5",xlim=c(min(hatvalues(model4),0),max(hatvalues(model4),0.7)),ylim=c(min(rstandard(model4)),max(rstandard(model4),3)))

plot(dffits(model1),ylab="DFFITS",main="DFFITS",col="red",ylim=c(-2,2))
plot(dffits(model2),ylab="DFFITS",main="DFFITS",col="red",ylim=c(-2,2))
plot(dffits(model3),ylab="DFFITS",main="DFFITS",col="red",ylim=c(-2,60))
plot(dffits(model4),ylab="DFFITS",main="DFFITS",col="red",ylim=c(-80,2))

summary(influence.measures(model1))
rstandard(model1)
summary(influence.measures(model2))
rstandard(model2)
summary(influence.measures(model3))
rstandard(model3)
summary(influence.measures(model4))
rstandard(model4)

And ofcourse the actual image