Symbolic data analysis (SDA) is an extension of standard data analysis in which symbolic data tables are used as input and symbolic objects are produced as output. The data units are called symbolic because they are more complex than standard ones: they not only contain values or categories, but also include internal variation and structure.[1][2]
ggESDA is an extension of ggplot2 for visualizing symbolic data through exploratory data analysis (EDA). The package contains many useful functions for exploratory plots. Furthermore, users can transform classical data into symbolic data with the functions provided in this package.
devtools::install_github("kiangkiangkiang/ggESDA")
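Alternatively, if a released version of the package is available on CRAN (an assumption worth checking), the standard installation also works:
install.packages("ggESDA")  # assumes a CRAN release of ggESDA exists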
The example data set is called “facedata” and comes from the RSDA package. It is in interval form, with minimal and maximal values. However, most symbolic data do not exist at the beginning; they are usually aggregated from classical data by a clustering method. Thus, we will use the classic2sym() function to transform classical data into symbolic data as the second example data set.
library(ggESDA)
# aggregated by the variable Species in iris
iris_interval <- classic2sym(iris, groupby = "Species")$intervalData
iris_interval
#>            Sepal.Length Sepal.Width Petal.Length Petal.Width
#> setosa [4.30 : 5.80] [2.30 : 4.40] [1.00 : 1.90] [0.10 : 0.60]
#> versicolor [4.90 : 7.00] [2.00 : 3.40] [3.00 : 5.10] [1.00 : 1.80]
#> virginica [4.90 : 7.90] [2.20 : 3.80] [4.50 : 6.90] [1.40 : 2.50]
class(iris_interval)
#> [1] "symbolic_tbl" "data.frame"
dim(iris_interval)
#> [1] 3 4
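If the classical data have no natural grouping variable, aggregation by a clustering method (mentioned above) can be used instead. A minimal sketch, assuming classic2sym() also accepts a clustering-based groupby such as "kmeans" together with a k argument (check ?classic2sym to confirm the interface):
# Hedged sketch: aggregate the numeric iris columns into 5 interval-valued
# concepts via k-means; groupby = "kmeans" and k are assumed arguments.
iris_kmeans <- classic2sym(iris[, 1:4], groupby = "kmeans", k = 5)$intervalData
dim(iris_kmeans)  # expected: 5 rows (one per cluster) x 4 interval variables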
myFacedata <- RSDA::facedata
head(myFacedata, 5)
#> # A tibble: 5 x 6
#> AD BC AH DH
#> <symblc_n> <symblc_n> <symblc_n> <symblc_n>
#> 1 [155.00 : 157.00] [58.00 : 61.01] [100.45 : 103.28] [105.00 : 107.30]
#> 2 [154.00 : 160.01] [57.00 : 64.00] [101.98 : 105.55] [104.35 : 107.30]
#> 3 [154.01 : 161.00] [57.00 : 63.00] [99.36 : 105.65] [101.04 : 109.04]
#> 4 [168.86 : 172.84] [58.55 : 63.39] [102.83 : 106.53] [122.38 : 124.52]
#> 5 [169.85 : 175.03] [60.21 : 64.38] [102.94 : 108.71] [120.24 : 124.52]
#> # ... with 2 more variables: EH <symblc_n>, GH <symblc_n>
class(myFacedata)
#> [1] "symbolic_tbl" "tbl_df" "tbl" "data.frame"
dim(myFacedata)
#> [1] 27 6
With the symbolic data generated, you can start to visualize it. Here we work with real symbolic data sets, the face recognition data and the Environment data, using the following functions:
ggInterval_index(facedata, aes(x = AD))
You can also change fill= and col= to make the plot more visible, and mapping the variable to the x or y axis will rotate the index lines in the figure.
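For instance, a minimal sketch that swaps the axis and recolors the index plot (the color values here are only illustrative):
# Hedged sketch: map AD to the y axis and set fill/col constants
ggInterval_index(facedata, aes(y = AD, fill = "blue", col = "red"))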
m <- mean(facedata$AD)
Concepts <- as.factor(rep(c("FRA", "HUS", "INC", "ISA", "JPL", "KHA",
                            "LOT", "PHI", "ROM"), each = 3))
ggInterval_index(facedata, aes(x = AD, fill = Concepts))+
theme_bw() +
scale_fill_brewer(palette = "Set2")+
geom_segment(x = m, xend = m, y = 0, yend = 27,
lty = 2, col = "red", lwd = 1) +
geom_text(aes(x = m, y = 28), label = "Mean")+
scale_y_continuous(breaks = 1:27,
labels = rownames(facedata))
ggInterval_minmax(facedata, aes(x = AD, size = 3))+
scale_color_manual(values = c("darkblue", "darkred")) +
  coord_fixed(ratio = 1) +
  theme_bw()
The width of the rectangles in the boxplot is not meaningful; instead, it is used, like the fill color, to show the difference between the quantiles.
ggInterval_boxplot(facedata, plotAll = T) +
theme_bw()
You can also change the fill color and any other aesthetic as you like.
ggInterval_boxplot(data = myFacedata, aes(x = AD, col = "black", lty = 2, lwd = 1.2)) +
scale_fill_manual(values = c("red","yellow",
"green","blue","grey"),
labels=c("0%","25%","50%","75%","100%"),
name="quantile")
The histogram of interval data calculates the frequency of each interval within bins. In the ggInterval_hist() function, there are two main parameters the user can adjust. One is bins, just as in geom_histogram(), with a default of 10. The other is method, which can be either equal-bin or unequal-bin; the default is equal-bin.
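As a minimal sketch of adjusting bins for a single variable (assuming ggInterval_hist() accepts an aes() mapping like the other ggInterval_* functions):
# Hedged sketch: 20 equal-width bins for the AD variable
ggInterval_hist(facedata, aes(x = AD), bins = 20)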
equal-bin method (default)
unequal-bin method (method = "unequal-bin")
Note: changing bins in the unequal-bin method is ignored because it conflicts with the binning algorithm.
equal_bin <- ggInterval_hist(facedata, plotAll = T) +
  theme_bw()
unequal_bin <- ggInterval_hist(facedata, plotAll = T,
                               method = "unequal-bin") +
  theme_bw()
ggpubr::ggarrange(equal_bin, unequal_bin, ncol = 2)
ggInterval_centerRange(iris_interval,aes(x = Sepal.Width)) +
geom_text(label = rownames(iris_interval), vjust = -0.8) +
scale_x_continuous(limits = c(2.6, 3.4)) +
scale_y_continuous(limits = c(1.3, 2.2))
ggInterval_centerRange(myFacedata[11:20, ],aes(x = GH))+
geom_text(label = rownames(myFacedata)[11:20], vjust = -0.8, size = 3)
Because each value is an interval, a rectangle is a natural way to represent the intervals of two variables.
myCol <- rep(RColorBrewer::brewer.pal(9, "Set1"), each = 3)
ggInterval_scatter(data = facedata, aes(x = AD, y = BC)) +
scale_fill_manual(values = myCol, name = "CONCEPTS",
label = rownames(facedata)) +
theme_bw()
ggInterval_scaMatrix(facedata)
ggInterval_2Dhist(iris_interval, aes(x = Sepal.Length, y = Petal.Length))
It can be adjusted by ggplot2 functions too. Using ?ggInterval_2Dhist will show more detail.
ggInterval_2Dhist(facedata, aes(x = BC, y = AH, col = "gray50")) +
  scale_fill_gradient(
    low = "gray85",
    high = "red"
  ) +
  theme_bw()
Note: it is not recommended to include too many variables, because the time complexity of computing the full matrix grows quickly; one option is to restrict the matrix to a few variables, as sketched after the example below.
ggInterval_2DhistMatrix(facedata,
xBins = 10,
yBins = 10,
removeZero = T,
addFreq = F)
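A minimal sketch of limiting the cost by passing only a few columns to the matrix function; the column selection is ordinary data-frame subsetting, nothing ggESDA-specific is assumed here:
# Hedged sketch: restrict the 2D-histogram matrix to three variables
ggInterval_2DhistMatrix(facedata[, c("AD", "BC", "AH")],
                        xBins = 10,
                        yBins = 10)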
ggInterval_indexImage(facedata, aes(x = AD)) +
coord_flip()
p1 <- ggInterval_indexImage(facedata, plotAll = T, column_condition = T,
                            full_strip = T)
p2 <- ggInterval_indexImage(facedata, plotAll = T, column_condition = F,
                            full_strip = T)
ggpubr::ggarrange(p1, p2, ncol = 2)
ggInterval_3Dscatter(iris_interval, aes(Sepal.Length, Petal.Length, Petal.Width))
If the variance is too large (or too small), or the difference between two variables is too large, the plot will be distorted or unidentifiable, which may happen when variables are measured in different units. In that case, standardization is necessary, using scale = TRUE.
ggInterval_3Dscatter(myFacedata[1:8, ], aes(AD, BC, AH), scale = TRUE)
p1 <- ggInterval_radar(Environment,
                       plotPartial = 2,
                       showLegend = F,
                       base_circle = T,
                       base_lty = 2,
                       addText = F) +
  labs(title = "") +
  theme_bw() +
  scale_fill_manual(values = c("gray50")) +
  scale_color_manual(values = c("gray50"))

p2 <- ggInterval_radar(Environment,
                       plotPartial = 7,
                       showLegend = F,
                       base_circle = F,
                       base_lty = 1,
                       addText = T) +
  labs(title = "") +
  theme_bw() +
  scale_fill_manual(values = c("gray50")) +
  scale_color_manual(values = c("gray50"))

ggpubr::ggarrange(p1, p2, ncol = 2)
It can also plot a subset of observations through plotPartial =, whose right-hand side takes the row indices of the observations you want to see.
p1 <- ggInterval_radar(Environment,
                       plotPartial = c(1, 4),
                       showLegend = F,
                       addText = F) +
  scale_fill_manual(values = c("darkblue", "darkred")) +
  scale_color_manual(values = c("darkblue", "darkred"))

p2 <- ggInterval_radar(Environment,
                       plotPartial = c(1, 4),
                       showLegend = F,
                       addText = F,
                       base_circle = F,
                       base_lty = 1,
                       type = "rect") +
  scale_fill_manual(values = c("darkblue", "darkred")) +
  scale_color_manual(values = c("darkblue", "darkred"))

ggpubr::ggarrange(p1, p2, ncol = 2)
A quantile radar plot:
dataSetList <- list(AbaloneIdt = AbaloneIdt,
                    BLOOD = BLOOD,
                    Cardiological = Cardiological,
                    facedata = facedata,
                    oils = oils,
                    mushroom = mushroom,
                    Environment = Environment)
myFill <- c("white", "gray10", "gray20",
            "gray30", "gray40", "gray50",
            "gray60", "gray70", "white",
            "white", "white")
myCol <- myFill; myCol[1] <- "black"
pList <- NULL
u <- 1
for (i in dataSetList) {
  p <- ggInterval_radar(i,
                        base_circle = F,
                        base_lty = 1,
                        type = "quantile",
                        quantileNum = 10,
                        showLegend = F,
                        Drift = 0) +
    labs(title = names(dataSetList)[u]) +
    scale_fill_manual(values = rev(myFill)) +
    scale_color_manual(values = rev(myCol)) +
    ggthemes::theme_hc()
  pList[[u]] <- p
  u <- u + 1
}
gridExtra::marrangeGrob(pList, nrow = 2, ncol = 4,
                        top = "")
<- rep(c("FRA", "HUS", "INC", "ISA", "JPL", "KHA",
CONCEPT "LOT", "PHI", "ROM"), each = 3)
<- ggInterval_PCA(facedata, poly = T,
p concepts_group = CONCEPT)
$ggplotPCA <- p$ggplotPCA + theme(legend.position = "top") +
ptheme_bw()
<- ggInterval_PCA(facedata, poly = F,
p2 concepts_group = CONCEPT)
$ggplotPCA <- p2$ggplotPCA + theme(legend.position = "top") +
p2theme_bw()
::ggarrange(p$ggplotPCA, p2$ggplotPCA, ncol = 2) ggpubr
myPCA <- p2
myPCA$loadings
#>
#> Loadings:
#> Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
#> [1,] 0.403 0.439 0.214 0.257 0.723 0.104
#> [2,] 0.306 0.497 -0.175 -0.779 -0.146
#> [3,] 0.523 -0.147 -0.463 0.273 -0.248 0.595
#> [4,] 0.557 0.206 0.283 -0.423 -0.620
#> [5,] -0.297 0.466 -0.700 0.329 -0.311
#> [6,] -0.268 0.563 0.421 0.253 -0.465 0.392
#>
#> Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
#> SS loadings 1.000 1.000 1.000 1.000 1.000 1.000
#> Proportion Var 0.167 0.167 0.167 0.167 0.167 0.167
#> Cumulative Var 0.167 0.333 0.500 0.667 0.833 1.000
cumsum(myPCA$sdev/sum(myPCA$sdev))
#> Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
#> 0.2938526 0.5401279 0.6873056 0.8139909 0.9203008 1.0000000
head(myPCA$scores_interval[,1:3])
#> PC_1 PC_2 PC_3
#> FRA1 [1.61 : 2.66] [0.27 : 1.57] [-1.00 : 0.29]
#> FRA2 [1.03 : 2.49] [-0.11 : 1.61] [-1.01 : 0.25]
#> FRA3 [0.81 : 2.99] [-0.40 : 1.87] [-1.20 : 0.88]
#> HUS1 [-1.10 : 0.24] [0.39 : 2.05] [-2.13 : -0.64]
#> HUS2 [-1.41 : 0.40] [0.56 : 2.65] [-2.32 : -0.29]
#> HUS3 [-1.42 : 0.24] [0.43 : 2.52] [-2.17 : -0.27]