In statistics, Cramér’s V (sometimes referred to as Cramér’s phi and denoted \(\phi_c\)) is a measure of association between two nominal variables, giving a value between 0 and +1 (inclusive). It is based on Pearson’s chi-squared statistic and was published by Harald Cramér in 1946. \(\phi_c\) is the intercorrelation of two discrete variables. Cramér’s V varies from 0 (corresponding to no association between the variables) to 1 (complete association) and can reach 1 only when the two variables are equal to each other. \(\phi_c^2\) is the mean square canonical correlation between the variables. Source: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
library(lsr)
# Preview the first six rows of the built-in airquality data set.
head(airquality, n = 6L)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Drop every row that has a missing value in any column. The assignment
# creates a cleaned copy named `airquality` that masks the built-in data set.
airquality <- na.omit(airquality)
library(dplyr)
# Split the cleaned data into two groups by month: before July, and July
# onward.
olddat <- airquality %>% filter(Month < 7)
newdat <- airquality %>% filter(Month >= 7)
# Compute cramersV() once per column. vapply() walks the data frame's columns
# directly (apply() would first coerce the whole frame to a matrix) and checks
# that every call returns a single number; the result is named by column.
# NOTE(review): each call receives one column only, whereas Cramer's V
# describes the association between two variables -- confirm that this
# single-column statistic is what the comparison intends.
old_cramers <- vapply(olddat, cramersV, numeric(1))
new_cramers <- vapply(newdat, cramersV, numeric(1))
sort(old_cramers, decreasing = TRUE) # highest values first
## Ozone Solar.R Day Wind Temp Month
## 0.14679128 0.10615937 0.09027299 0.05254938 0.02168161 0.01493147
# Rank the July-onward statistics, largest first: order() gives the
# permutation, and na.last = NA drops missing values as sort() does by default.
new_cramers[order(new_cramers, decreasing = TRUE, na.last = NA)]
## Ozone Day Solar.R Wind Month Temp
## 0.08063118 0.06334230 0.04994634 0.04090122 0.01189213 0.01063208
# Put the two result vectors side by side: one row per variable, one column
# per half of the data.
old_new <- cbind(old_cramers, new_cramers)
# Plot layout adapted from:
# http://thecoatlessprofessor.com/programming/creating-stacked-barplot-and-grouped-barplot-in-r-using-base-graphics-no-ggplot2/
# Widen the right margin and allow drawing outside the plot region so the
# legends (placed with a negative inset) sit beside the bars.
par(mar = c(5.1, 4.1, 4.1, 7.1), xpd = TRUE)
# Rescale each column of the matrix so that it sums to 1.
prop <- prop.table(old_new, margin = 2)
# Both matrices share the same row names, so one label vector and one
# palette serve all three charts.
var_labels <- rownames(old_new)
bar_cols <- heat.colors(length(var_labels))

# Stacked bars of the raw values.
barplot(old_new, col = bar_cols, main = "Stacked count")
legend("topright", inset = c(-.25, 0), fill = bar_cols, legend = var_labels)

# Side-by-side (unstacked) bars.
# NOTE(review): this chart draws the column proportions in `prop` although
# its title says "count" -- confirm whether `old_new` was intended.
barplot(prop, col = bar_cols, width = 2, beside = TRUE,
        main = "Unstacked count")
legend("topright", inset = c(-.25, 0), fill = bar_cols, legend = var_labels)

# Stacked bars of the column proportions.
barplot(prop, col = bar_cols, width = 2, main = "Stacked %")
legend("topright", inset = c(-0.25, 0), fill = bar_cols, legend = var_labels)
library(plotly)
library(reshape2)
# suggested layout from Paul to clearly see differences
# Reshape the proportion matrix to long format: one row per
# (variable, group) pair. `prop` is a matrix, so melt() dispatches to the
# array method, which names its output columns through `varnames` and
# `value.name`; that method takes no `measure.vars` argument.
p <- melt(prop, varnames = c("Var1", "Var2"), value.name = "value")
# Show the long-format result column by column.
as.list(p)
## $Var1
## [1] Ozone Solar.R Wind Temp Month Day Ozone Solar.R
## [9] Wind Temp Month Day
## Levels: Ozone Solar.R Wind Temp Month Day
##
## $Var2
## [1] old_cramers old_cramers old_cramers old_cramers old_cramers
## [6] old_cramers new_cramers new_cramers new_cramers new_cramers
## [11] new_cramers new_cramers
## Levels: old_cramers new_cramers
##
## $value
## [1] 0.33949121 0.24551986 0.12153345 0.05014409 0.03453273 0.20877866
## [7] 0.31331908 0.19408302 0.15893519 0.04131447 0.04621081 0.24613744
# Interactive bar chart of the long-format proportions: variables on the
# x axis, bar height from `value`, one colour per level of Var2.
# Columns are referenced with one-sided formulas (~column) so that plotly
# (version 4 and later) looks them up inside `p`; bare names would be
# evaluated in the calling environment and fail with "object not found".
plot_p <- plot_ly(
  p,
  x = ~Var1,
  y = ~value,
  color = ~factor(Var2),
  type = "bar"
)
plot_p