Hierarchical clustering is a clustering method that builds a hierarchy of clusters. It can be either agglomerative or divisive.
- Agglomerative: at the first step, every item is its own cluster; then clusters are merged, based on their distances, into bigger clusters until all the data ends up in one cluster (bottom up). The complexity is \( O(n^2 \log n) \). A minimal sketch of this bottom-up merging follows this list.
- Divisive: at the beginning, all items are in one big cluster; then we iteratively split this cluster into smaller clusters (top down). The complexity is \( O(2^n) \).
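To make the bottom-up idea concrete, here is a minimal, naive single-link sketch in plain Python/NumPy (not a library routine); the five 1-D points are invented for illustration:

import numpy as np

# Illustrative 1-D points; each starts in its own cluster (bottom up).
points = np.array([0.0, 1.0, 2.0, 8.0, 9.0])
clusters = [[i] for i in range(len(points))]

def single_link(c1, c2):
    # Smallest pairwise distance between the two clusters.
    return min(abs(points[i] - points[j]) for i in c1 for j in c2)

# Repeatedly merge the two closest clusters until one remains.
while len(clusters) > 1:
    a, b = min(
        ((i, j) for i in range(len(clusters)) for j in range(i + 1, len(clusters))),
        key=lambda ij: single_link(clusters[ij[0]], clusters[ij[1]]),
    )
    print("merge", clusters[a], clusters[b])
    clusters[a] = clusters[a] + clusters[b]
    del clusters[b]

Real implementations (such as SciPy's linkage, used later in this post) keep a distance matrix and update it after each merge instead of recomputing every pairwise distance.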
To merge or divide clusters we need a measure of the distance between two clusters. The common metrics are:
- Single link: the smallest distance between any two points, one from each cluster.
- Complete link: the largest distance between any two points, one from each cluster.
- Average link: the average distance over all pairs of points, one from each cluster.
- Centroid: the distance between the cluster centroids.
Depending on the definition of the distance between clusters (single, complete, average, or centroid link) we get different hierarchical clustering methods; the sketch below illustrates the four definitions on two small clusters.
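The short sketch below computes the four inter-cluster distances with NumPy/SciPy; the arrays cluster_a and cluster_b are invented for the example:

import numpy as np
from scipy.spatial.distance import cdist

# Two illustrative 2-D clusters.
cluster_a = np.array([[0.0, 0.0], [1.0, 0.0]])
cluster_b = np.array([[4.0, 3.0], [5.0, 3.0]])

pairwise = cdist(cluster_a, cluster_b)   # all point-to-point distances

single   = pairwise.min()    # single link: smallest distance
complete = pairwise.max()    # complete link: largest distance
average  = pairwise.mean()   # average link: mean distance
centroid = np.linalg.norm(cluster_a.mean(axis=0) - cluster_b.mean(axis=0))

print(single, complete, average, centroid)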
Hierarchical Algorithms:
- Single link: at each iteration, the two clusters that contain the closest pair of elements are merged into a bigger cluster.
- Average link: the distance between two clusters is the average distance between all pairs of points, one from each cluster. The two clusters with the minimum of these distances are merged into a bigger cluster.
- Complete link: the distance between two clusters is the distance between their two farthest points. The two clusters with the minimum of these distances are merged into a bigger cluster.
- Minimum spanning tree (MST): in a connected graph without cycles, a spanning tree is a subgraph that is a tree and still connects all the vertices. If the edges have weights, the MST is the spanning tree with the minimum total edge weight. The MST may not be unique. (A note on how the MST relates to single-link clustering follows this list.)
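The MST appears in this list because single-link clustering can be read off the MST of the complete distance graph: the single-link merge heights coincide with the sorted MST edge weights. A rough sketch on made-up 1-D points:

import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage
from scipy.sparse.csgraph import minimum_spanning_tree

# Illustrative points.
pts = np.array([[0.0], [1.0], [2.0], [8.0], [9.0]])

# Single-link merge heights (third column of the linkage matrix)...
heights = np.sort(linkage(pts, method='single')[:, 2])

# ...match the sorted MST edge weights of the complete distance graph.
dense_mst = minimum_spanning_tree(squareform(pdist(pts))).toarray()
mst_weights = np.sort(dense_mst[dense_mst > 0])

print(heights)
print(mst_weights)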
To visualize the outcome of hierarchical clustering we often use a dendrogram.
import scipy.cluster.hierarchy as hierarchal_clustering
import matplotlib.pyplot as plt

# Five 1-D points, labelled A..E.
data = [[0], [1], [2], [3], [4]]
data_dict = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
data_labels = sorted(data_dict, key=data_dict.get)
metric = 'euclidean'

# Complete-link dendrogram.
method = 'complete'
plt.title(method)
hierarchal_clusteringlink = hierarchal_clustering.linkage(data, method, metric)
dendro = hierarchal_clustering.dendrogram(hierarchal_clusteringlink, labels=data_labels)
plt.show()

# Centroid-link dendrogram.
method = 'centroid'
plt.title(method)
hierarchal_clusteringlink = hierarchal_clustering.linkage(data, method, metric)
dendro = hierarchal_clustering.dendrogram(hierarchal_clusteringlink, labels=data_labels)
plt.show()
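Beyond the dendrogram plot, the same linkage matrix can be cut into flat clusters with scipy.cluster.hierarchy.fcluster (not used in the snippet above); a brief sketch with an arbitrary choice of two clusters:

import scipy.cluster.hierarchy as hierarchal_clustering

data = [[0], [1], [2], [3], [4]]
link = hierarchal_clustering.linkage(data, 'complete', 'euclidean')

# Cut the hierarchy into (at most) 2 flat clusters.
assignments = hierarchal_clustering.fcluster(link, t=2, criterion='maxclust')
print(assignments)   # cluster id for each of the five input points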
The following graph represents the adjacency matrix below:
A,B,C,D,E
A,0,1,2,2,3
B,1,0,2,4,3
C,2,2,0,1,5
D,2,4,1,0,3
E,3,3,5,3,0
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import csv
import numpy as np

def make_label_dict(labels):
    # Map node index -> node label (A..E).
    l = {}
    for i, label in enumerate(labels):
        l[i] = label
    return l

input_data = pd.read_csv('data/adjacency_matrix.csv', index_col=0)
#print(input_data.head())
print(input_data.values)

# Build the weighted graph from the adjacency matrix.
G = nx.DiGraph(input_data.values)

with open('data/adjacency_matrix.csv', 'r') as f:
    d_reader = csv.DictReader(f)
    headers = d_reader.fieldnames
#print(headers)

labels = make_label_dict(headers)
#print(labels)

# Draw the full graph with its edge weights.
edge_labels = dict(((u, v), d["weight"]) for u, v, d in G.edges(data=True))
pos = nx.spring_layout(G)
nx.draw(G, pos)
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
nx.draw(G, pos, node_size=500, labels=labels, with_labels=True)
plt.show()

# Minimum spanning tree of the same matrix.
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree

X = csr_matrix(np.array(input_data.values))
Tcsr = minimum_spanning_tree(X)
MST = Tcsr.toarray().astype(int)
print(MST)

# Draw the MST with its edge weights.
G_MST = nx.DiGraph(MST)
edge_labels = dict(((u, v), d["weight"]) for u, v, d in G_MST.edges(data=True))
pos = nx.spring_layout(G_MST)
nx.draw(G_MST, pos)
nx.draw_networkx_edge_labels(G_MST, pos, edge_labels=edge_labels)
nx.draw(G_MST, pos, node_size=500, labels=labels, with_labels=True)
plt.show()
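As an optional cross-check (not part of the script above), NetworkX can compute an MST directly from an undirected view of the same adjacency matrix; because several edge weights tie, the chosen edges may differ from the SciPy result, but the total weight is the same:

import pandas as pd
import networkx as nx

input_data = pd.read_csv('data/adjacency_matrix.csv', index_col=0)

# Undirected graph from the (symmetric) adjacency matrix.
G = nx.Graph(input_data.values)
mst = nx.minimum_spanning_tree(G, weight='weight')
print(sorted(mst.edges(data='weight')))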