Usage and interpretation

In statistics, Cramér’s V (sometimes referred to as Cramér’s phi and denoted \(\phi_c\)) is a measure of association between two nominal variables, giving a value between 0 and +1 (inclusive). It is based on Pearson’s chi-squared statistic and was published by Harald Cramér in 1946. \(\phi_c\) is the intercorrelation of two discrete variables. Cramér’s V varies from 0 (corresponding to no association between the variables) to 1 (complete association) and can reach 1 only when the two variables are equal to each other. \(\phi_c^2\) is the mean square canonical correlation between the variables. Source: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V


airquality dataset

library(lsr)

# Preview the first rows of the airquality data before any cleaning
head(airquality)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
# Keep only complete cases: na.omit() drops every row containing an NA.
airquality <- na.omit(airquality)

library(dplyr)

# Split the observations into two groups by month.
olddat <- airquality %>% filter(Month < 7)
newdat <- airquality %>% filter(Month >= 7)

# Evaluate cramersV() once per column of each subset.
# vapply() walks the data-frame columns directly (no matrix coercion, unlike
# apply()) and enforces exactly one numeric value per column; the result is a
# numeric vector named after the columns.
# NOTE(review): each column is passed to cramersV() on its own, i.e. as a
# single vector rather than as a pair of nominal variables -- confirm this is
# the statistic that is intended here.
old_cramers <- vapply(olddat, cramersV, numeric(1))
new_cramers <- vapply(newdat, cramersV, numeric(1))

# Show the statistics for the Month < 7 subset, largest value first
old_cramers %>% sort(decreasing = TRUE)
##      Ozone    Solar.R        Day       Wind       Temp      Month 
## 0.14679128 0.10615937 0.09027299 0.05254938 0.02168161 0.01493147
# Show the statistics for the Month >= 7 subset, largest value first
new_cramers %>% sort(decreasing = TRUE)
##      Ozone        Day    Solar.R       Wind      Month       Temp 
## 0.08063118 0.06334230 0.04994634 0.04090122 0.01189213 0.01063208
# Put both result vectors side by side as the columns of one matrix
old_new <- cbind(old_cramers = old_cramers, new_cramers = new_cramers)

Static graphics with base

# Approach adapted from:
# http://thecoatlessprofessor.com/programming/creating-stacked-barplot-and-grouped-barplot-in-r-using-base-graphics-no-ggplot2/

# Widen the right-hand margin and turn clipping to the plot region off (xpd)
# so the legend can be drawn beside the bars.
par(mar = c(5.1, 4.1, 4.1, 7.1), xpd = TRUE)

# Rescale every column of old_new so that it sums to 1.
prop <- prop.table(old_new, margin = 2)

# Legend labels and one colour per row, shared by all three charts.
series_names <- rownames(old_new)
bar_cols <- heat.colors(length(series_names))

# Stacked bars of the raw values in old_new.
# NOTE(review): the title says "count", but old_new holds the cramersV()
# results -- confirm the wording.
barplot(old_new, col = bar_cols, main = "Stacked count")
legend("topright", inset = c(-0.25, 0), fill = bar_cols, legend = series_names)

# Side-by-side (unstacked) bars of the column proportions.
# NOTE(review): titled "count" although it plots `prop` -- confirm whether the
# title or the plotted matrix is the intended one.
barplot(prop, col = bar_cols, width = 2, beside = TRUE, main = "Unstacked count")
legend("topright", inset = c(-0.25, 0), fill = bar_cols, legend = series_names)

# Stacked bars of the column proportions.
barplot(prop, col = bar_cols, width = 2, main = "Stacked %")
legend("topright", inset = c(-0.25, 0), fill = bar_cols, legend = series_names)

Dynamic graphic with plotly

library(plotly)
library(reshape2)

# suggested layout from Paul to clearly see differences
# Reshape the proportion matrix into long format: one row per matrix cell,
# with the row label in Var1, the column label in Var2 and the cell in value.
# `prop` is a matrix, so the column names are set through `varnames` and
# `value.name`; `measure.vars` only applies when melting a data frame.
p <- melt(prop, varnames = c("Var1", "Var2"), value.name = "value")
as.list(p)
## $Var1
##  [1] Ozone   Solar.R Wind    Temp    Month   Day     Ozone   Solar.R
##  [9] Wind    Temp    Month   Day    
## Levels: Ozone Solar.R Wind Temp Month Day
## 
## $Var2
##  [1] old_cramers old_cramers old_cramers old_cramers old_cramers
##  [6] old_cramers new_cramers new_cramers new_cramers new_cramers
## [11] new_cramers new_cramers
## Levels: old_cramers new_cramers
## 
## $value
##  [1] 0.33949121 0.24551986 0.12153345 0.05014409 0.03453273 0.20877866
##  [7] 0.31331908 0.19408302 0.15893519 0.04131447 0.04621081 0.24613744
# Interactive bar chart: Var1 on the x axis, value as bar height, and one
# colour per level of Var2.
# Columns of `p` are referenced with one-sided formulas (~col) so plot_ly()
# evaluates them inside the data frame; bare names would instead be looked up
# in the calling environment.
plot_p <- plot_ly(
  p,
  x = ~Var1,
  y = ~value,
  color = ~Var2,
  type = "bar"
)
plot_p