Some example data science plots in R using ggplot2
. See
https://github.com/WinVector/WVPlots
for code/details.
set.seed(34903490)
x = rnorm(50)
y = 0.5*x^2 + 2*x + rnorm(length(x))
frm = data.frame(
x = x,
y = y,
yC = y>=as.numeric(quantile(y,probs=0.8)),
stringsAsFactors = FALSE)
frm$absY <- abs(frm$y)
frm$posY = frm$y > 0
Scatterplot with smoothing line through points.
Scatterplot with best linear fit through points. Also report the R-squared and significance of the linear fit.
WVPlots::ScatterHist(frm, "x", "y", smoothmethod="lm",
title="Example Linear Fit", estimate_sig = TRUE)
Scatterplot compared to the line x = y
. Also report the
coefficient of determination between x
and y
(where y
is “true outcome” and x
is “predicted
outcome”).
WVPlots::ScatterHist(frm, "x", "y", smoothmethod="identity",
title="Example Relation Plot", estimate_sig = TRUE)
Scatterplot of (x, y) color-coded by category/group, with marginal distributions of x and y conditioned on group.
set.seed(34903490)
fmScatterHistC = data.frame(
x=rnorm(50),
y=rnorm(50),
stringsAsFactors = FALSE)
fmScatterHistC$cat <- fmScatterHistC$x+fmScatterHistC$y>0
WVPlots::ScatterHistC(fmScatterHistC, "x", "y", "cat", title="Example Conditional Distribution")
Scatterplot of (x, y) color-coded by discretized z.
The continuous variable z is binned into three groups, and then
plotted as by ScatterHistC
set.seed(34903490)
frmScatterHistN = data.frame(
x=rnorm(50),
y=rnorm(50),
stringsAsFactors = FALSE)
frmScatterHistN$z <- frmScatterHistN$x+frmScatterHistN$y
WVPlots::ScatterHistN(frmScatterHistN, "x", "y", "z", title="Example Joint Distribution")
Plot the relationship y as a function of x with a
smoothing curve that estimates \(E[y |
x]\). If y is a 0/1 variable as below (binary
classification, where 1 is the target class), then the smoothing curve
estimates \(P(y | x)\). Since \(y \in \{0,1\}\) with \(y\) intended to be monotone in \(x\) is the most common use of this graph,
BinaryYScatterPlot
uses a glm
smoother by
default (use_glm=TRUE
, this is essentially Platt scaling),
as the best estimate of \(P(y |
x)\).
if(requireNamespace("hexbin", quietly = TRUE)) {
set.seed(5353636)
df = rbind(data.frame(x=rnorm(1000, mean = 1),
y=rnorm(1000, mean = 1, sd = 0.5 ),
stringsAsFactors = FALSE),
data.frame(x = rnorm(1000, mean = -1, sd = 0.5),
y = rnorm(1000, mean = -1, sd = 0.5),
stringsAsFactors = FALSE),
stringsAsFactors = FALSE)
print(WVPlots::HexBinPlot(df, "x", "y", "Two gaussians"))
}
set.seed(34903490)
y = abs(rnorm(20)) + 0.1
x = abs(y + 0.5*rnorm(20))
frm = data.frame(
model=x,
value=y,
stringsAsFactors = FALSE)
frm$costs=1
frm$costs[1]=5
frm$rate = with(frm, value/costs)
frm$isValuable = (frm$value >= as.numeric(quantile(frm$value, probs=0.8)))
Basic curve: each item “costs” the same. The wizard sorts by true value, the x axis sorts by the model, and plots the fraction of the total population.
We can annotate a point of the model at a specific x value
gainx = 0.10 # get the top 10% most valuable points as sorted by the model
# make a function to calculate the label for the annotated point
labelfun = function(gx, gy) {
pctx = gx*100
pcty = gy*100
paste("The top ", pctx, "% most valuable points by the model\n",
"are ", pcty, "% of total actual value", sep='')
}
WVPlots::GainCurvePlotWithNotation(frm, "model", "value",
title="Example Gain Curve with annotation",
gainx=gainx,labelfun=labelfun)
When the x
values have different costs, take that into
account in the gain curve. The wizard now sorts by value/cost, and the x
axis is sorted by the model, but plots the fraction of total cost,
rather than total count.
set.seed(34903490)
# data with two different regimes of behavior
frm <- rbind(
data.frame(
model = rnorm(1000),
isValuable = sample(c(TRUE, FALSE), prob = c(0.02, 0.98), size = 1000, replace = TRUE)),
data.frame(
model = rnorm(200) + 5,
isValuable = sample(c(TRUE, FALSE), size = 200, replace = TRUE))
)
WVPlots::ROCPlot(frm, "model", "isValuable", TRUE, title="Example ROC plot")
Plotting the ROC of two models on the same data, where predictions and true outcome all in the same data frame.
set.seed(34903490)
x1 = rnorm(50)
x2 = rnorm(length(x1))
y = 0.2*x2^2 + 0.5*x2 + x1 + rnorm(length(x1))
frmP = data.frame(
x1=x1,
x2=x2,
yC = y>=as.numeric(quantile(y,probs=0.8)),
stringsAsFactors = FALSE)
# WVPlots::ROCPlot(frmP, "x1", "yC", TRUE, title="Example ROC plot")
# WVPlots::ROCPlot(frmP, "x2", "yC", TRUE, title="Example ROC plot")
WVPlots::ROCPlotPair(frmP, "x1", "x2", "yC", TRUE, title="Example ROC pair plot")
Plotting the results from two data sets, for example the results of a model on training and test sets, where predictions/outcome for the two data sets are in different data frames.
set.seed(2342458)
make_data <- function(nrows) {
d <- data.frame(x = rnorm(nrows))
d['y'] = sin(d['x']) + 0.25*rnorm(n = nrows)
d['x2'] = rnorm(n = nrows)
d['yc'] = d[['y']]>0.5
return(d)
}
training <- make_data(500)
test <- make_data(200)
model <- glm(yc ~ x + x2, data=training, family=binomial)
training$pred <- predict(model, newdata=training, type="response")
test$pred <- predict(model, newdata=test, type="response")
WVPlots::ROCPlotPair2(nm1 = "Training", # model 1
frame1 = training,
xvar1 = "pred", truthVar1 = "yc", truthTarget1 = TRUE,
nm2 ="Test", # model 2
frame2 = test,
xvar2 = "pred", truthVar2 = "yc", truthTarget2 = TRUE,
title = "Model performance, training vs test",
estimate_sig = FALSE)
Many ROC
plots on the same graph.
set.seed(34903490)
x1 = rnorm(50)
x2 = rnorm(length(x1))
x3 = rnorm(length(x1))
y = 0.2*x2^2 + 0.5*x2 + x1 + rnorm(length(x1))
frm_m = data.frame(
x1 = x1,
x2 = x2,
x3 = x3,
yC = y >= as.numeric(quantile(y,probs=0.8)))
WVPlots::ROCPlotPairList(
frame = frm_m,
xvar_names = c("x1", "x2", "x3"),
truthVar = "yC", truthTarget = TRUE,
title = "Example ROC list plot")
Plots precision and recall as functions of different classifier thresholds.
ThresholdPlot()
plots a variety of functions of
different classifier thresholds.
# replicate PRTPlot. Looks a little different because ThresholdPlot does different smoothing
WVPlots::ThresholdPlot(frm, "model", "isValuable", title="Reproduce PRTPlot",
truth_target=TRUE, # default
metrics = c("precision", "recall"))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
# default: sensitivity/specificity
WVPlots::ThresholdPlot(frm, "model", "isValuable",
title="Sensitivity and Specificity as a Function of Threshold")
One useful application of ThresholdPlot
is to “unroll”
an ROC plot: if the ROC shows that your model can meet an acceptable
trade-off of true positive rate and false positive rate, then
ThresholdPlot
can tell you which threshold achieves that
goal.
WVPlots::ThresholdPlot(frm, "model", "isValuable", title="ROC 'unrolled'",
metrics = c("true_positive_rate", "false_positive_rate"))
An extended example can be found here.
set.seed(34903490)
# discrete variable: letters of the alphabet
# frequencies of letters in English
# source: http://en.algoritmy.net/article/40379/Letter-frequency-English
letterFreqs = c(8.167, 1.492, 2.782, 4.253, 12.702, 2.228,
2.015, 6.094, 6.966, 0.153, 0.772, 4.025, 2.406, 6.749, 7.507, 1.929,
0.095, 5.987, 6.327, 9.056, 2.758, 0.978, 2.360, 0.150, 1.974, 0.074)
letterFreqs = letterFreqs/100
letterFrame = data.frame(
letter = letters,
freq=letterFreqs,
stringsAsFactors = FALSE)
# now let's generate letters according to their letter frequencies
N = 1000
randomDraws = data.frame(
draw=1:N,
letter=sample(letterFrame$letter, size=N, replace=TRUE, prob=letterFrame$freq),
stringsAsFactors = FALSE)
WVPlots::ClevelandDotPlot(randomDraws, "letter", title = "Example Cleveland-style dot plot")
WVPlots::ClevelandDotPlot(randomDraws, "letter", limit_n = 10, title = "Top 10 most frequent letters")
WVPlots::ClevelandDotPlot(randomDraws, "letter", sort=0, title="Example Cleveland-style dot plot, unsorted")
WVPlots::ClevelandDotPlot(randomDraws, "letter", sort=1, stem=FALSE, title="Example with increasing sort order + coord_flip, no stem") + ggplot2::coord_flip()
ClevelandDotPlot
also accepts an integral x
variable. You probably want sort = 0
in this case.
set.seed(34903490)
N = 1000
ncar_vec = 0:5
prob = c(1.5, 3, 3.5, 2, 1, 0.75); prob = prob/sum(prob)
df = data.frame(
num_cars = sample(ncar_vec, size = N, replace = TRUE, prob=prob),
stringsAsFactors = FALSE)
WVPlots::ClevelandDotPlot(df, "num_cars", sort = 0, title = "Distribution of household vehicle ownership")
Plot a bar chart of row counts conditioned on the categorical
variable condvar
, faceted on a second categorical variable,
refinevar
. Each faceted plot also shows a “shadow plot” of
the totals conditioned on condvar
alone.
This plot enables comparisons of sub-population totals across both
condvar
and refinevar
simultaneously.
set.seed(354534)
N = 100
# rough proportions of eye colors
eprobs = c(0.37, 0.36, 0.16, 0.11)
eye_color = sample(c("Brown", "Blue", "Hazel", "Green"), size = N, replace = TRUE, prob = eprobs)
sex = sample(c("Male", "Female"), size = N, replace = TRUE)
# A data frame of eye color by sex
dframe = data.frame(
eye_color = eye_color,
sex = sex,
stringsAsFactors = FALSE)
WVPlots::ShadowPlot(dframe, "eye_color", "sex", title = "Shadow plot of eye colors by sex")
Plot a histogram of a continuous variable xvar
, faceted
on a categorical conditioning variable, condvar
. Each
faceted plot also shows a “shadow plot” of the unconditioned histogram
for comparison.
set.seed(354534)
N = 100
dframe = data.frame(
x = rnorm(N),
gp = "region 2",
stringsAsFactors = FALSE)
dframe$gp = with(dframe, ifelse(x < -0.5, "region 1",
ifelse(x > 0.5, "region 3", gp)))
WVPlots::ShadowHist(dframe, "x", "gp", title = "X values by region")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the WVPlots package.
## Please report the issue at <https://github.com/WinVector/WVPlots/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ShadowHist
uses the Brewer Dark2 palette by default. You
can pass in another Brewer palette to change the color scheme. If you
prefer all the histograms to be the same color, set
monochrome=TRUE
.
To use a non-Brewer palette, such as viridis, or a manual color map,
set palette=NULL
. Here’s an example of setting the color
palette manually.
classes = c("a", "b", "c")
means = c(2, 4, 3)
names(means) = classes
label = sample(classes, size=1000, replace=TRUE)
meas = means[label] + rnorm(1000)
frm2 = data.frame(label=label,
meas = meas,
stringsAsFactors = FALSE)
WVPlots::ScatterBoxPlot(frm2, "label", "meas", pt_alpha=0.2, title="Example Scatter/Box plot")
Compare to a binomial with the same success rate as the observed data
set.seed(13951)
trial_size = 20 # one trial is 20 flips
ntrial = 100 # run 100 trials
true_frate = 0.4 # true heads probability
fdata = data.frame(n_heads = rbinom(ntrial, trial_size, true_frate),
stringsAsFactors = FALSE)
title = paste("Distribution of head counts, trial size =", trial_size)
# compare to empirical p
WVPlots::PlotDistCountBinomial(fdata, "n_heads", trial_size, title)
Compare to a binomial with a specified success rate
set.seed(349521)
N = 100 # number of cohorts
psucc = 0.15 # true success rate in population
group_size = round(runif(N, min=25, 50)) # sizes of observed sample groups
nsucc = rbinom(N, group_size, psucc) # successes in each group
hdata = data.frame(n_success=nsucc,
group_size=group_size,
stringsAsFactors = FALSE)
# observed rate of successes in each group
hdata$rate_success = with(hdata, n_success/group_size)
title = "Observed prevalence of success in population"
WVPlots::PlotDistHistBeta(hdata, "rate_success", title)
y = c(1,2,3,4,5,10,15,18,20,25)
x = seq_len(length(y))
df = data.frame(x=x,
y=y,
stringsAsFactors = FALSE)
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="centered smooth, one group")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="left smooth, one group", align="left")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="right smooth, one group", align="right")
n = length(x)
df = rbind(data.frame(x=x, y=y+rnorm(n), gp="times 1", stringsAsFactors = FALSE),
data.frame(x=x, y=0.5*y + rnorm(n), gp="times 1/2", stringsAsFactors = FALSE),
data.frame(x=x, y=2*y + rnorm(n), gp="times 2", stringsAsFactors = FALSE),
stringsAsFactors = FALSE)
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", "gp", title="centered smooth, multigroup")