\documentclass[serif,xcolor=pdftex,dvipsnames,table,hyperref={bookmarks=false,breaklinks}]{beamer}
\input{../config.tex}
\settitlecard{15}{Hierarchical Clustering}
\begin{document}
\maketitlepage
\section{Introduction}
\subsection{Foo}
\begin{frame}[t]{Views on Machine Learning}
\iconbox{1}{}{../Figures/mitchell.jpg}{\textbf{Mitchell (1997):} ``A computer
program is said to learn from experience E with respect to some class of tasks
T
and performance measure P, if its performance at tasks in T, as measured by P,
improves with experience E.''\\[12pt] Substitute ``training data D'' for
``experience E.''}
\end{frame}
\begin{frame}[t]{Machine Learning Tasks}
\centering
\includegraphics[width=4in]{../Figures/learning_problems.png}
\end{frame}
\begin{frame}[t]{The Classification Task}
\begin{block}{Definition: The Classification Task}
Given a feature vector $\mbf{x}\in\mathbb{R}^D$ that describes an object that
belongs to one of $C$ classes from the set $\mathcal{Y}$, predict which class
the object belongs to.
\end{block}
\end{frame}
\begin{frame}[t]{The Clustering Task}
\begin{block}{Definition: The Clustering Task}
Given a collection of data cases $\mbf{x}_i\in\mathbb{R}^D$, partition the
data cases into groups such that the data cases within each partition are
more similar to each other than they are to data cases in other partitions.
\end{block}
\pause
\center
\includegraphics[width=3in]{../Figures/clustering_example.png}
\end{frame}
\begin{frame}[t]{Examples: Market Segmentation}
\center
\includegraphics[width=4.5in]{../Figures/market_segmentation.jpg}
\end{frame}
\begin{frame}[t]{Examples: Community Detection}
\center
\includegraphics[width=3.5in]{../Figures/community_detection.jpg}
\end{frame}
\begin{frame}[t]{Examples: Gene Expression}
\center
\includegraphics[width=3.5in]{../Figures/gene_clustering.jpg}
\end{frame}
\begin{frame}[t]{Examples: Phylogenetic Trees}
\center
\includegraphics[width=4.5in]{../Figures/phylogeny.jpg}
\end{frame}
\begin{frame}[t]{Examples: Super Pixels}
\center
\includegraphics[width=4in]{../Figures/super_pixels.png}
\end{frame}
\section{Exhaustive Clustering}
\subsection{foo}
\begin{frame}[t]{Defining a Clustering}
\begin{itemize}
\item Suppose we have $N$ data cases $\mathcal{D}=\{\mbf{x}_i\}_{i=1:N}$.
\pause \item A clustering of the $N$ cases into $K$ clusters is a partitioning
of $\mathcal{D}$ into $K$ mutually disjoint subsets
$\mathcal{C}=\{C_1,...,C_K\}$ such that
$C_1 \cup ... \cup C_K = \mathcal{D}$.
\end{itemize}
\end{frame}
\begin{frame}[t]{Exhaustive Clustering}
\begin{itemize}
\item Suppose we have a function $f(\mathcal{C})$ that takes a partitioning
$\mathcal{C}$ of the data set $\mathcal{D}$ and returns a score, with lower
scores indicating better clusterings.
\pause\item The optimal clustering according to $f$ is simply given by
$$\arg\min_{\mathcal{C}} \;\;f(\mathcal{C})$$
(brute-force search, sketched on the next slide).
\pause\item \textbf{Question:} What is the complexity of exhaustive clustering?
\end{itemize}
\end{frame}
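\begin{frame}[t,fragile]{Sketch: Exhaustive Clustering by Brute Force}
A minimal Python sketch of the $\arg\min$ on the previous slide. The toy data
and the score $f$ (within-cluster sum of squares plus a per-cluster penalty,
so that all-singletons does not trivially win) are illustrative assumptions.
\begin{verbatim}
def partitions(items):
    """Yield every partition of a list as a list of blocks."""
    if len(items) == 1:
        yield [items]
        return
    first, rest = items[0], items[1:]
    for part in partitions(rest):
        for i in range(len(part)):   # put first into block i ...
            yield part[:i] + [[first] + part[i]] + part[i+1:]
        yield [[first]] + part       # ... or into its own block

def f(clustering):
    """Within-cluster sum of squares + penalty per cluster."""
    wcss = sum(sum((x - sum(c) / len(c)) ** 2 for x in c)
               for c in clustering)
    return wcss + 2.0 * len(clustering)

data = [1.0, 1.2, 5.0, 5.3, 9.1]
print(min(partitions(data), key=f))
# three clusters: {1.0, 1.2}, {5.0, 5.3}, {9.1}
\end{verbatim}
\end{frame}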
\begin{frame}[t]{Number of Clusterings}
\begin{itemize}
\item The total number of clusterings of a set of $N$ elements is the
Bell number $B_N$ where $B_0=1$ and $B_{n+1} = \sum_{k=0}^n {n \choose k} B_k$.
\pause \item The first few Bell numbers are: 1, 1, 2, 5, 15, 52, 203, 877,
4140, 21147, 115975, 678570, 4213597, 27644437, 190899322, ... (checked in
code on the next slide).
\pause \item The complexity of exhaustive clustering scales with $B_N$ and is
thus computationally intractable for general scoring functions.
\pause\item We will need either approximation algorithms or scoring functions
with special properties.
\end{itemize}
\end{frame}
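\begin{frame}[t,fragile]{Sketch: Computing Bell Numbers}
A quick Python check of the recurrence $B_0=1$,
$B_{n+1}=\sum_{k=0}^{n}\binom{n}{k}B_k$ from the previous slide.
\begin{verbatim}
from math import comb

def bell(n_max):
    B = [1]                  # B_0 = 1
    for n in range(n_max):   # build B_{n+1} from B_0, ..., B_n
        B.append(sum(comb(n, k) * B[k] for k in range(n + 1)))
    return B

print(bell(14))
# [1, 1, 2, 5, 15, 52, 203, 877, 4140, 21147, 115975,
#  678570, 4213597, 27644437, 190899322]
\end{verbatim}
\end{frame}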
\section{Hierarchical Clustering}
\subsection{foo}
\begin{frame}[t]{Hierarchical Agglomerative Clustering}
\begin{itemize}
\item Hierarchical Clustering methods are a family of greedy tree-based
clustering methods.
\pause\item Hierarchical Agglomerative Clustering (HAC) is the most popular
member of this family.
\pause\item It begins with every data case assigned to its own cluster, then
greedily and recursively merges the pair of clusters that is optimal with
respect to a given criterion.
\end{itemize}
\end{frame}
\begin{frame}[t]{Distance and Linkage Functions}
\begin{itemize}
\item Like KNN, HAC needs to be supplied with a function for computing the
distance between two data cases. This is often taken to be Euclidean
distance, but could be any distance function.
\pause\item To merge clusters, HAC also needs what is called a linkage
function for measuring the distance between clusters (sketched in code on
the next slide).
\pause\item Linkage functions can differ
significantly in their computational complexity and the clusterings they
produce.
\end{itemize}
\end{frame}
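\begin{frame}[t,fragile]{Sketch: Three Common Linkage Functions}
A minimal Python sketch of single, complete, and average linkage, assuming
Euclidean distance between data cases; the tiny clusters are made up for
illustration.
\begin{verbatim}
import math

def d(x, y):                  # Euclidean distance between cases
    return math.dist(x, y)

def single(A, B):             # closest cross-cluster pair
    return min(d(a, b) for a in A for b in B)

def complete(A, B):           # farthest cross-cluster pair
    return max(d(a, b) for a in A for b in B)

def average(A, B):            # mean over all cross-cluster pairs
    return sum(d(a, b) for a in A for b in B) / (len(A) * len(B))

A = [(0.0, 0.0), (1.0, 0.0)]
B = [(4.0, 0.0), (6.0, 0.0)]
print(single(A, B), complete(A, B), average(A, B))  # 3.0 6.0 4.5
\end{verbatim}
\end{frame}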
\begin{frame}[t]{Examples of Linkage Functions}
\center
\includegraphics[width=4in]{../Figures/linkage.png}
\end{frame}
\begin{frame}[t]{The Hierarchical Agglomerative Clustering Algorithm}
\center
\includegraphics[width=4in]{../Figures/hac_algorithm.png}
\end{frame}
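\begin{frame}[t,fragile]{Sketch: The HAC Loop}
A naive $O(N^3)$ Python sketch of the greedy merge loop shown on the previous
slide, reusing \texttt{single} from the linkage sketch; real implementations
use much more efficient data structures.
\begin{verbatim}
def hac(points, linkage, K):
    """Greedily merge clusters until only K remain."""
    clusters = [[p] for p in points]  # one cluster per case
    while len(clusters) > K:
        # find the pair of clusters at smallest linkage distance
        i, j = min(((a, b) for a in range(len(clusters))
                           for b in range(a + 1, len(clusters))),
                   key=lambda ab: linkage(clusters[ab[0]],
                                          clusters[ab[1]]))
        clusters[i] += clusters[j]    # merge the closest pair
        del clusters[j]
    return clusters

pts = [(0.0, 0.0), (0.5, 0.0), (4.0, 0.0), (4.5, 0.0)]
print(hac(pts, single, 2))
# [[(0.0, 0.0), (0.5, 0.0)], [(4.0, 0.0), (4.5, 0.0)]]
\end{verbatim}
\end{frame}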
\begin{frame}[t]{Example: Data}
\center
\includegraphics[width=3in]{../Figures/hac_example_data.png}
\end{frame}
\begin{frame}[t]{Example: Dendrograms}
\center
\includegraphics[width=4in]{../Figures/hac_example.png}
\end{frame}
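\begin{frame}[t,fragile]{Sketch: Dendrograms with SciPy}
A usage sketch, assuming SciPy and Matplotlib are available: \texttt{linkage}
computes the full HAC merge history and \texttt{dendrogram} draws a tree like
the one above; \texttt{method='average'} is one linkage choice among several.
\begin{verbatim}
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

X = np.random.rand(20, 2)          # stand-in for the example data
Z = linkage(X, method='average')   # (N-1) x 4 merge history
dendrogram(Z)                      # draw the tree
plt.show()
\end{verbatim}
\end{frame}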
\begin{frame}[t]{Issues}
\begin{itemize}
\item We need to have a good notion of similarity for the results of cluster
analysis to be meaningful at all.
\pause\item As with KNN, pre-processing like re-scaling/normalizing features can
completely change the results.
\pause\item Further, we need to select among the different linkage functions.
\pause\item We need some way to determine the ``right'' number of clusters to
focus on. We want to cluster on salient differences between data cases, not
noise.
\pause \item This procedure cannot gracefully handle noise observations that
differ both from each other and from the data cases that do belong to valid
clusters.
\pause \item All of these issues mean we need to be cautious in interpreting
the results of clustering. It should be the starting point for an exploratory
data analysis, not the end point.
\end{itemize}
\end{frame}
\end{document}