Tornado and Importance Plots

Overview

A tornado plot is a visualization of the range of outputs expected from a variety of inputs, or alternatively, the sensitivity of the output to the range of inputs. Tornado plots have a number of features:

An importance plot attempts to dispay the relative impact of each variable on the model fit. Traditionally, for linear models, the concept of importance was expressed as the percentage of total response variable variance that is explained by each variable, either alone, or in the presence of the other variables in the model. Each model type can have different measures of importance. In this package, the following methods are used:

Data Sets

A few standard data sets are used in these examples:

mtcars_w_factors <- mtcars
# Automatic vs Manual
mtcars_w_factors$am <- factor(mtcars$am)
# V or Straight cylinder arrangement
mtcars_w_factors$vs <- factor(mtcars$vs)
# number of cylinders
mtcars_w_factors$cyl <- factor(mtcars$cyl)
# number of forward gears
mtcars_w_factors$gear <- factor(mtcars$gear)
# number of carburetors
mtcars_w_factors$carb <- factor(mtcars$carb)

Tornado Plot Examples

LM

type = “PercentChange”

lm1 <- lm(mpg ~ cyl*wt*hp, data = mtcars)
torn1 <- tornado::tornado(lm1, type = "PercentChange", alpha = 0.10)
plot(torn1, xlabel = "MPG", geom_bar_control = list(width = 0.4))

type = “rangess”

torn2 <- tornado::tornado(lm1, type = "ranges")
plot(torn2, xlabel = "MPG", geom_bar_control = list(width = 0.4))

type = “percentiles”

torn3 <- tornado::tornado(lm1, type = "percentiles", alpha = 0.05)
plot(torn3, xlabel = "MPG", geom_bar_control = list(width = 0.4))

Including Categorical Variables

lm4 <- lm(mpg ~ cyl + wt + hp + vs, data = mtcars_w_factors)
torn4 <- tornado::tornado(lm4, type = "percentiles", alpha = 0.05)
plot(torn4, xlabel = "MPG", geom_bar_control = list(width = 0.4))

Changing Variable Names

dict <- list(old = c("cyl", "wt", "hp", "vs"),
             new = c("Cylinders", "Weight", "Horsepower", "V_or_Straight"))
torn5 <- tornado::tornado(lm4, type = "percentiles", alpha = 0.05, dict = dict)
plot(torn5, xlabel = "MPG", geom_bar_control = list(width = 0.4))

Plotting Options

dict <- list(old = c("cyl", "wt", "hp", "vs"),
             new = c("Cylinders", "Weight", "Horsepower", "V_or_Straight"))
torn5 <- tornado::tornado(lm4, type = "percentiles", alpha = 0.05)
plot(torn5, xlabel = "MPG", geom_bar_control = list(width = 0.4),
     sensitivity_colors = c("#FC8D62", "#66C2A5"),
     geom_point_control = list(size = 3, fill = "purple", col = "purple"))

Extending the ggplot

Notes:

  • The plot center is at 0 and the x-axis is relabeled. So Adding lines to the plot should use a zero centered coordinate system
  • Also, the plot is drawn using coord_flip, so the horizonal line below is draw vertically
g <- plot(torn5, plot = FALSE, xlabel = "MPG", geom_bar_control = list(width = 0.4),
          sensitivity_colors = c("#FC8D62", "#66C2A5"),
          geom_point_control = list(size = 3, fill = "purple", col = "purple"))
g <- g + ggtitle("Test Plot")
g <- g + geom_hline(yintercept = 0, col = "black", lwd = 2)
plot(g)

GLM

Predict if an engine is a V or Straight given covariates

glm1 <- glm(vs ~ wt + disp + cyl, data = mtcars, family = binomial(link = "logit"))
torn1 <- tornado::tornado(glm1, type = "ranges", alpha = 0.10)
plot(torn1, xlabel = "V or Straight Engine", geom_bar_control = list(width = 0.4))

Censored Data

Surival Regression or Accelerated Failure

survreg1 <- survival::survreg(survival::Surv(futime, fustat) ~ ecog.ps + rx + age + resid.ds, 
                              survival::ovarian, dist = 'weibull', scale = 1)
torn1 <- tornado::tornado(survreg1, modeldata = survival::ovarian, 
                          type = "PercentChange", alpha = 0.10)
plot(torn1, xlabel = "Survival Time", geom_bar_control = list(width = 0.4))

Cox Proportional Hazards

coxph1 <- survival::coxph(survival::Surv(stop, event) ~ rx + size + number,
                          survival::bladder)
torn1 <- tornado::tornado(coxph1, modeldata = survival::bladder, type = "PercentChange",
                          alpha = 0.10)
plot(torn1, xlabel = "Risk", geom_bar_control = list(width = 0.4))

Ridge Regression and LASSO

if (has_glmnet)
{
  form <- formula(mpg ~ cyl*wt*hp)
  mf <- model.frame(form, data=mtcars)
  mm <- model.matrix(form, mf)
  gtest <- glmnet::cv.glmnet(x = mm, y = mtcars$mpg, family = "gaussian")
  torn <- tornado::tornado(gtest, modeldata = mtcars, 
                           form = formula(mpg ~ cyl*wt*hp), 
                           s = "lambda.1se",
                           type = "PercentChange", alpha = 0.10)
  plot(torn, xlabel = "MPG", geom_bar_control = list(width = 0.4))
} else
{
  print("glmnet is not available for vignette rendering")
}

Machine Learning Models

Regression

if (has_caret)
{
  gtest <- caret::train(x = subset(mtcars_w_factors, select = -mpg), 
                        y = mtcars_w_factors$mpg, method = "rf")
  torn <- tornado::tornado(gtest, type = "percentiles", alpha = 0.10)
  plot(torn, xlabel = "MPG")
} else
{
  print("caret is not available for vignette rendering")
}
#> Loading required package: lattice

Classification

The plot method can also return a ggplot object

if (has_caret)
{
  gtest <- caret::train(x = subset(iris, select = -Species), 
                        y = iris$Species, method = "rf")
  torn <- tornado::tornado(gtest, type = "percentiles", alpha = 0.10, class_number = 1)
  g <- plot(torn, plot = FALSE, xlabel = "Probability of the Setosa Species")
  g <- g + ggtitle("Classifier caret::train 'rf', 10th to 90th perc. of each var.")
  plot(g)

  torn <- tornado::tornado(gtest, type = "percentiles", alpha = 0.10, class_number = 2)
  g <- plot(torn, plot = FALSE, xlabel = "Probability of the versicolor Species")
  plot(g)
} else
{
  print("caret is not available for vignette rendering")
}

Importance Plots

Linear Models

gtest <- lm(mpg ~ cyl*wt*hp + gear + carb, data = mtcars)
gtestreduced <- lm(mpg ~ 1, data = mtcars)
imp <- tornado::importance(gtest, gtestreduced)
plot(imp)

Using a dictionary to translate the variable names and chaning colors

dict <- list(old = c("cyl", "wt", "hp", "vs", "gear", "carb"),
             new = c("Cylinders", "Weight", "Horsepower", "V_or_Straight", "Num Gears", "Num Carbs"))
imp <- tornado::importance(gtest, gtestreduced, dict = dict)
plot(imp, col_importance_alone = "#8DD3C7", 
     col_importance_cumulative = "#FFFFB3")

Generalized Linear Models

gtest <- glm(vs ~ wt + disp + gear, data = mtcars, family = binomial(link = "logit"))
gtestreduced <- glm(vs ~ 1, data = mtcars, family = binomial(link = "logit"))
imp <- tornado::importance(gtest, gtestreduced)
plot(imp)

Censored Data

model_final <- survival::survreg(survival::Surv(futime, fustat) ~ ecog.ps*rx + age,
                                 data = survival::ovarian,
                                 dist = "weibull")
imp <- tornado::importance(model_final, survival::ovarian, nperm = 100)
plot(imp, geom_bar_control = list(width = 0.4, fill = "blue"))

Machine Learning

Regression

The number of variables plotted can also be controlled

if (has_caret)
{
  gtest <- caret::train(x = subset(mtcars_w_factors, select = -mpg), 
                        y = mtcars_w_factors$mpg, method = "rf")
  imp <- tornado::importance(gtest)
  plot(imp, nvar = 7)
} else
{
  print("caret is not available for vignette rendering")
}

Classification

The plot method can also return a ggplot object

if (has_caret)
{
  gtest <- caret::train(x = subset(iris, select = -Species), 
                        y = iris$Species, method = "rf")
  imp <- tornado::importance(gtest)
  g <- plot(imp, plot = FALSE)
  g <- g + ggtitle("Classifier caret::train randomforest: variable importance")
  plot(g)
} else
{
  print("caret is not available for vignette rendering")
}