Hands on Churn Prediction with R and comparison of Different Models for Churn Prediction
Using GLM, Decision Tree and Random Forest to predict Churn and compare the models with their accuracy and AUC values
What is Churn ?
Churn rate, when applied to a customer base, refers to the proportion of contractual customers or subscribers who leave a supplier during a given time period.(wikipedia)
Why Churn Prediction is important?
Churn is directly related to the profitability of a company. The more some can learn about customer behaviors, the more profit can be gained. This also helps identifying and improving areas or fields where customer service is lacking.
Getting and Processing Data
The data was collected from IBM. Firstly load the libraries required and the data and take a look on the data.
library(tidyverse)
library(caret)
library(repr)
library(caTools)
library(rpart)
library(rpart.plot)
library(ggpubr)# input the data and take a look on the variables
data <- read.csv("telco.csv")
glimpse(data)data <- data[complete.cases(data),] # removing na's
About the Data:
- In this data, all the rows represents different customer and each column represents their attributes.
- The column Churn indicates customers leaving on the last month.
- Customers’ subscription to the services — phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies.
- Account informations of each customer — contract, payment method, paperless billing, monthly charges, and total charges.
- Demographic informations — gender, age range, and if they have partners and dependents
Data Wrangling:
- Terms like “No internet service” and “No phone service” should be changed to “No” for convenience.
data <- data.frame(lapply(data, function(x) {
gsub("No internet service", "No", x)}))data <- data.frame(lapply(data, function(x) {
gsub("No phone service", "No", x)}))
2. SeniorCitizen responds in 1 and 0, which is changed to “Yes” or “No”.
data$SeniorCitizen <- as.factor(ifelse(data$SeniorCitizen==1, 'YES', 'NO'))
3. Convert double type variables into numeric type and store them in a variable as a data frame.
num_columns <- c("tenure", "MonthlyCharges", "TotalCharges")
data[num_columns] <- sapply(data[num_columns], as.numeric)data_int <- data[,c("tenure", "MonthlyCharges", "TotalCharges")]
data_int <- data.frame(scale(data_int))
4. Tenure is in months. We should convert it into years.
data <- mutate(data, tenure_year = tenure)data$tenure_year[data$tenure_year >=0 & data$tenure_year <= 12] <- '0-1 year'
data$tenure_year[data$tenure_year > 12 & data$tenure_year <= 24] <- '1-2 years'
data$tenure_year[data$tenure_year > 24 & data$tenure_year <= 36] <- '2-3 years'
data$tenure_year[data$tenure_year > 36 & data$tenure_year <= 48] <- '3-4 years'
data$tenure_year[data$tenure_year > 48 & data$tenure_year <= 60] <- '4-5 years'
data$tenure_year[data$tenure_year > 60 & data$tenure_year <= 72] <- '5-6 years'data$tenure_year <- as.factor(data$tenure_year)
5. Prepare the final data for analysis excluding selected variables which we don’t need throughout the analsis process.
data$tenure_year <- as.factor(data$tenure_year)data_req <- data[,-c(1,6,19,20)]
x <- data.frame(sapply(data_req,function(x) data.frame(model.matrix(~x-1,data =data_req))[,-1]))x <- na.omit(x) # omit the NA's
data_int <- na.omit(data_int) # omit the NA'sdata_final <- cbind(x, data_int)
Exploratory Analysis
- Correlation plot between the numeric variables:
nv <- sapply(data_int, is.numeric)
cormat <- cor(data_int[,nv])
ggcorrplot::ggcorrplot(cormat, title = "Correlation of Numeric Variables")
2. Churn percentage tells around 26.58% of the customer left during last month.
churn <- data %>%
group_by(Churn) %>%
summarise(Count = n())%>%
mutate(percentage = prop.table(Count)*100)
ggplot(churn, aes(reorder(Churn, -percentage), percentage), fill = Churn)+
geom_col(fill = c("green", "red"))+
geom_text(aes(label = sprintf("%.2f%%", percentage)))+
xlab("Churn") +
ylab("Percent")+
ggtitle("Churn Percentage")
3. Bar plots to show churn rate in categorical variables.
fig1 <- ggarrange(ggplot(data, aes(x=gender,fill=Churn))+ geom_bar() ,
ggplot(data, aes(x=SeniorCitizen,fill=Churn))+ geom_bar(position = 'fill'),
ggplot(data, aes(x=Partner,fill=Churn))+ geom_bar(position = 'fill'),
ggplot(data, aes(x=Dependents,fill=Churn))+ geom_bar(position = 'fill'),
ggplot(data, aes(x=PhoneService,fill=Churn))+ geom_bar(position = 'fill'),
ggplot(data, aes(x=MultipleLines,fill=Churn))+ geom_bar(position = 'fill'),
ggplot(data, aes(x=InternetService,fill=Churn))+ geom_bar(position = 'fill'),
ggplot(data, aes(x=OnlineSecurity,fill=Churn))+ geom_bar(position = 'fill'),
ggplot(data, aes(x=OnlineBackup,fill=Churn))+ geom_bar(position = 'fill'),
ggplot(data, aes(x=DeviceProtection,fill=Churn))+ geom_bar(position = 'fill'),
ggplot(data, aes(x=TechSupport,fill=Churn))+ geom_bar(position = 'fill'),
ggplot(data, aes(x=StreamingTV,fill=Churn))+ geom_bar(position = 'fill'),
ggplot(data, aes(x=StreamingMovies,fill=Churn))+
geom_bar(position = 'fill'),
ggplot(data, aes(x=Contract,fill=Churn))+
geom_bar(position = 'fill'),
ggplot(data, aes(x=PaperlessBilling,fill=Churn))+
geom_bar(position = 'fill'),
ggplot(data, aes(x=PaymentMethod,fill=Churn))+
geom_bar(position = 'fill')+theme_bw()+
rremove("x.text"),
ncol = 3, nrow = 6,
common.legend = TRUE, legend = "bottom")
annotate_figure(fig1, bottom = text_grob("Churn Percentage in categorical variables", col = "blue", face = "bold", size = 14))
4. Bar plot to show churn percentage in numeric variables.
fig2 <- ggarrange(
ggplot(data, aes(y= tenure, x = "", fill = Churn))
+geom_boxplot() + xlab(" "),
ggplot(data, aes(y= MonthlyCharges, x = "", fill = Churn)) +geom_boxplot() + xlab(" "),
ggplot(data, aes(y= TotalCharges, x = "", fill = Churn))
+geom_boxplot() + xlab(" "),
rremove("x.text"),
ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom")annotate_figure(fig2, bottom = text_grob("Churn Percentage in numeric variables", col = "red", face = "bold", size = 14))
Analysis With Different Models
Logistic Regression
- Split the data in train and test sets
set.seed(123)
split <- sample.split(data_final$Churn, SplitRatio = 0.70)
train <- data_final[split,]
test <- data_final[!(split),]
2. Calculate the baseline accuray
prop.table(table(train$Churn))
3. Fitting the model using glm() function
glm <- glm(Churn ~., data = train, family = "binomial")
summary(glm)
4. Measuring accuracy
pred <- predict(glm, data = train, type = "response")# confusion matrix on training setglmtab1 <- table(train$Churn, pred >= 0.5)
acc_glm_train <- (3275+708)/nrow(train)# observations on the test set
predtest <- predict(glm, newdata = test, type = "response")
glmtab2 <- table(test$Churn, predtest >= 0.5)
acc_glm_test <- (1382+307)/nrow(test)
Accuracy = 0.80
5. Important Variables
Random Forest
- Fitting the model
fit_rf <- randomForest(Churn ~ ., data=train, proximity=FALSE,importance = FALSE)
fit_rf
2. Calculating accuracy
predrf <- predict(fit_rf, data = "train", type = "response")rftab <- table(predrf, train$Churn)
acc_rf_train <- (3248+694)/nrow(train)
acc_rf_train
Accuracy = 0.80
3. Error and important variables plot
plot(fit_rf)
varImpPlot(rf)
Decission Tree
- Fit the model
rpart <- rpart(Churn ~. , data = train, method = "class", control = rpart.control((cp = 0.05)))
summary(rpart)
2. Calculatuing accuracy
rpred <- predict(rpart, data = train, type = "class")
dtab1 <- table(rpred, train$Churn)
acc_rpart_train <- (3380+522)/nrow(train)
rpredt <- predict(rpart, newdata = test, type = "class")
dtab2 <- table(rpredt, test$Churn)
acc_rpredt_test <- (1435+218)/nrow(test)
Accuracy = 0.78
3. Important variables
AUC for all three models
Reference for this code: rstudio-pubs.
library(pROC)glm.roc <- roc(response = test$Churn, predictor = as.numeric(predtest))
rpart.roc <- roc(response = train$Churn, predictor = as.numeric(predrf))
rf.roc <- roc(response = test$Churn, predictor = as.numeric(rpredt))plot(glm.roc, legacy.axes = TRUE, print.auc.y = 1.0, print.auc = TRUE)
plot(rpart.roc, col = "blue", add = TRUE, print.auc.y = 0.65, print.auc = TRUE)
plot(rf.roc, col = "red" , add = TRUE, print.auc.y = 0.85, print.auc = TRUE)legend("bottom", c("Random Forest", "Decision Tree", "Logistic"),
lty = c(1,1), lwd = c(2, 2), col = c("red", "blue", "black"), cex = 0.75)
Conclusion:
- Comparing accuracy and AUC value, Logistic Model performs better than Random Forest and Decision Tree to predict churn in this particular dataset.
- Tenure, Contract, PaperlessBilling, InternetService are of the most important features.
- Some features like gender, partner etc have no impact on churn.
Other Stories: