- Provide an overview of hierarchical clustering
- K-means vs. hierarchical clustering
- Examples
This procedure starts with each point as a singleton cluster, and then repeatedly combines the two nearest clusters until a single cluster containing all points remains. This process can be visualized using a dendrogram, from which we can then choose an appropriate number of clusters.
Here we are going to perform cluster analysis on the NFL Running Backs 2016 data set. This data set contains the seasonal statistics of each starting running back (32 total) for the 2016 - 2017 NFL season.
# Load the NFL RB data set
nfl <- read.csv("NFL Running Backs 2016.csv")

# Drop the categorical columns, then label each row by player name
nfl.rb <- subset(nfl, select = -c(Player, Year, Team, Pos))
rownames(nfl.rb) <- nfl$Player

# Center and scale the numeric columns. Important!!
nfl.rb <- scale(nfl.rb)

# Perform hierarchical clustering; hclust() needs a
# distance matrix, so wrap the data in dist()
nfl.rb.cluster <- hclust(dist(nfl.rb))
# Print a summary of the fitted hierarchical clustering
# (linkage method, distance metric, number of objects)
print(nfl.rb.cluster)
## ## Call: ## hclust(d = dist(nfl.rb)) ## ## Cluster method : complete ## Distance : euclidean ## Number of objects: 32
Visualizing Dendrogram
# Draw the dendrogram; sub = "" suppresses the default subtitle
plot(x = nfl.rb.cluster, sub = "")
From this Dendrogram we can visually see each cluster.
If we want to separate this into n groups, we use the following code:
# Cut the tree into 3 clusters
nfl.rb.clustercut <- cutree(nfl.rb.cluster, k = 3)

# List the players ordered by cluster id
sort(nfl.rb.clustercut)
## JohnsonDa ElliotEz BellLe FreemanDe WestTe ## 1 1 1 2 2 ## McCoyLe StewartJo HowardJo MillerLa GoreFr ## 2 2 2 2 2 ## GurleyTo AjayiJa IngramMa LeGarretteBl MurrayLa ## 2 2 2 2 2 ## MelvinGo HydeCa MurrayDe HillJe JohnsonDu ## 2 2 2 3 3 ## AndersonCJ RiddickTh LacyEd IvoryCh CharlesJa ## 3 3 3 3 3 ## PetersonAd JenningsRa ForteMa MathewsRy RawlsTh ## 3 3 3 3 3 ## MartinDo KelleyRo ## 3 3
K-means requires the user to specify the number of clusters, k, up front. Finding the optimal k may be difficult in practice, but specifying it directly gives the user more control. Hierarchical clustering instead builds clusters incrementally, and the number of clusters can be chosen afterward by cutting the dendrogram.
# Load the NFL RB data set
nfl2 <- read.csv("NFL Running Backs 2016.csv")

# Drop the categorical columns, then label each row by player name
nfl2.rb <- subset(nfl2, select = -c(Player, Year, Team, Pos))
rownames(nfl2.rb) <- nfl2$Player

# Center and scale the numeric columns. Important!!
nfl2.rb <- scale(nfl2.rb)

# K-means starts from random centers, so fix the RNG seed for
# reproducibility, and use several random starts (nstart) so the
# result is not an artifact of one unlucky initialization
set.seed(2016)
nfl2.rb.cluster <- kmeans(nfl2.rb, centers = 3, nstart = 25)
# Show each player's k-means cluster assignment, grouped by cluster id
assignments <- nfl2.rb.cluster$cluster
sort(assignments)
## JohnsonDu AndersonCJ RiddickTh LacyEd IvoryCh ## 1 1 1 1 1 ## CharlesJa PetersonAd RawlsTh MartinDo JohnsonDa ## 1 1 1 1 2 ## FreemanDe McCoyLe HowardJo ElliotEz IngramMa ## 2 2 2 2 2 ## BellLe MelvinGo MurrayDe WestTe StewartJo ## 2 2 2 3 3 ## HillJe MillerLa GoreFr GurleyTo AjayiJa ## 3 3 3 3 3 ## LeGarretteBl JenningsRa ForteMa MurrayLa MathewsRy ## 3 3 3 3 3 ## HydeCa KelleyRo ## 3 3
As we can see, using 3 clusters, the resulting cluster assignments differ between K-means and hierarchical clustering. Using domain knowledge, someone may choose one clustering model over the other.
Using the NFL Wide Receivers 2016.csv data set, complete the following:
2 Clusters
# Load plotting libraries
library(ggplot2)
library(ggrepel)

# Load the NFL WR data set
nfl <- read.csv("NFL Wide Receivers 2016.csv")

# Drop the categorical columns, then label each row by player name
nfl.wr <- subset(nfl, select = -c(Player, Year, Team, Pos))
rownames(nfl.wr) <- nfl$Player

# Center and scale the numeric columns. Important!!
nfl.wr <- scale(nfl.wr)

# Hierarchical clustering on the Euclidean distance matrix
nfl.wr.cluster <- hclust(dist(nfl.wr))

# Cut the tree into 2 clusters
# (renamed from nfl.rb.clustercut: this is wide-receiver data)
nfl.wr.clustercut <- cutree(nfl.wr.cluster, k = 2)

# Attach the cluster assignment to the original data as a
# factor column named "Cluster" (done in one step instead of
# renaming the last column afterwards)
df <- data.frame(nfl, Cluster = as.factor(nfl.wr.clustercut))
# Plot receiving TDs vs. receptions, coloured by cluster.
# Use the bare column name inside aes() — referring to df$Cluster
# inside aes() is a ggplot2 anti-pattern (breaks faceting and
# bypasses the data argument)
ggplot(df, aes(Rec, TD, colour = Cluster)) +
  geom_text_repel(aes(label = Player)) +
  theme(legend.position = "none") +
  ggtitle("Receiving Touchdowns vs. Receptions Using HC - 2 Clusters")
3 Clusters
# Cut the WR dendrogram into 3 clusters and attach the assignment
# to the data as a factor column named "Cluster"
nfl.wr.clustercut <- cutree(nfl.wr.cluster, k = 3)
df <- data.frame(nfl, Cluster = as.factor(nfl.wr.clustercut))

# Plot receiving TDs vs. receptions, coloured by cluster
# (bare column name in aes(), not df$Cluster)
ggplot(df, aes(Rec, TD, colour = Cluster)) +
  geom_text_repel(aes(label = Player)) +
  theme(legend.position = "none") +
  ggtitle("Receiving Touchdowns vs. Receptions Using HC - 3 Clusters")
4 Clusters
# Cut the WR dendrogram into 4 clusters and attach the assignment
# to the data as a factor column named "Cluster"
nfl.wr.clustercut <- cutree(nfl.wr.cluster, k = 4)
df <- data.frame(nfl, Cluster = as.factor(nfl.wr.clustercut))

# Plot receiving TDs vs. receptions, coloured by cluster
# (bare column name in aes(), not df$Cluster)
ggplot(df, aes(Rec, TD, colour = Cluster)) +
  geom_text_repel(aes(label = Player)) +
  theme(legend.position = "none") +
  ggtitle("Receiving Touchdowns vs. Receptions Using HC - 4 Clusters")
5 Clusters
# Cut the WR dendrogram into 5 clusters and attach the assignment
# to the data as a factor column named "Cluster"
nfl.wr.clustercut <- cutree(nfl.wr.cluster, k = 5)
df <- data.frame(nfl, Cluster = as.factor(nfl.wr.clustercut))

# Plot receiving TDs vs. receptions, coloured by cluster
# (bare column name in aes(), not df$Cluster)
ggplot(df, aes(Rec, TD, colour = Cluster)) +
  geom_text_repel(aes(label = Player)) +
  theme(legend.position = "none") +
  ggtitle("Receiving Touchdowns vs. Receptions Using HC - 5 Clusters")