% defence-pres.tex
\documentclass{beamer}
\usetheme{Ilmenau}
\usepackage[all]{xy}
\title{A Statistical Method for Syntactic Dialectometry}
\author{Nathan Sanders}
\date{\today}
\AtBeginSection[] % Do nothing for \section*
{
\begin{frame}<beamer>
\frametitle{Outline}
\tableofcontents[currentsection]
\end{frame}
}
\begin{document}
\frame{\titlepage}
\section[Outline]{}
\frame{\tableofcontents}
\begin{frame}
This dissertation establishes the reliability and utility of a
statistical measure for syntactic dialectometry.
\end{frame}
\section{Introduction}
\begin{frame}
\frametitle{Dialectology}
\begin{columns}
\column[c]{0.5\textwidth}
\begin{definition}
Dialectology is the study of linguistic variation.
\end{definition}
% One variable at a time, use your in-built intuition to combine
% variables. This requires good intuition and lots of it.
\column[c]{0.5\textwidth}
\includegraphics[scale=0.22]{dialektboka-karta8}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Dialectometry}
\begin{columns}
\column[c]{0.5\textwidth}
\begin{definition}
Dialectometry is a subfield that studies variation quantitatively,
using methods from statistics and information theory.
\end{definition}
% Combine variables first, use less intuition.
% This requires good combining methods, which is harder than it
% sounds.
% NOTE: You can still manually extract variables by hand. It's a
% separate issue. That's how my work differs from Spruit's.
\column[r]{0.5\textwidth}
\includegraphics[scale=0.25]{Sverigekarta-cluster-5-1000}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Abstract Distance Measure Model}
\[\xymatrix@C=1pc{
\textrm{Corpus} \ar@{>}[d]|{} &
S = s_0,s_1,\ldots
\ar@{>}[dd]|{f}
&&
T = t_0,t_1,\ldots
\ar@{>}[dd]|{f}
\\
\textrm{Decomposition}\ar@{>}[dd] &&&\\
&
*{\begin{array}{c}
\left[ + f_0, +f_1 \ldots \right], \\
\left[ - f_0, +f_1 \ldots \right], \\
\ldots \\ \end{array}}
\ar@{>}[dr]
&&
*{\begin{array}{c}
\left[ + f_0, -f_1 \ldots \right], \\
\left[ + f_0, -f_1 \ldots \right], \\
\ldots \\ \end{array}}
\ar@{>}[dl] \\
\textrm{Combination} && \textrm{Distance} & \\
} \]
\end{frame}
% % These two are probably too much--more appropriate for a 50 minute
% % talk
% \begin{frame}
% Feature decomposition
% \end{frame}
% \begin{frame}
% Distance measure
% \end{frame}
\begin{frame}
\frametitle{Phonological versus Syntactic Dialectology}
Unlike phonology, in syntax:
\begin{itemize}
% because it's easier to introspect on syntax, perhaps because of the
% influence of a writing system, which also leads to standardization
\item There are fewer syntactic dialect differences.
% : a larger corpus is needed
\item There is no accepted measure of syntactic distance.
% : because corpora have to be larger, but there is no good way to
% annotate larger corpora collected this way.
\item There is no accepted way to extract linguistic information
from syntactic corpora.
% : without manual annotation, mainstream syntax cannot be used
% (and in fact NO corpus of mainstream annotated sentences exists,
% much less a dialect one.)
% ((Idiot syntacticians don't believe that corpora is useful, even
% small non-repeating ones. ANYWAY. This is the reason that
% empiricists don't believe that minimalism can possibly be
% capturing actual language, because the theory has never been
% tested on actual language, just miniature tests.))
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Syntax and Dialectology}
Goebl and Spruit
\begin{itemize}
\item They propose small numbers of features extracted by hand and a
complex distance measure.
\item This works with smaller corpora, including dialect surveys.
\item Gaps in linguistic knowledge can bias feature selection.
%In other words, you have to do dialectology before you can do dialectometry
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Syntax and Dialectology}
Nerbonne \& Wiersma and Sanders
\begin{itemize}
\item They propose large numbers of automatically extracted features
and a simple distance measure.
\item This requires larger corpora.
\item But they can be automatically annotated.
\item Until now, there has been no thorough examination of this
method's ability to reproduce existing dialect knowledge.
\end{itemize}
\end{frame}
\section{Questions}
\begin{frame}
\frametitle{Question 1}
Do the results of this statistical measure for syntactic dialectometry
agree with dialectology?
%% \includegraphics[scale=0.25]{Sverigekarta-Landskap-consensus-5-1000}
% oops I need a map of the traditional areas here. I don't have one
% of those in PDF form I think
%% \includegraphics[scale=0.25]{Sverigekarta-Landskap-consensus-5-1000}
\end{frame}
\begin{frame}
\frametitle{Question 2}
What parameter variations produce the best agreement with
dialectology?
\end{frame}
\begin{frame}
\frametitle{Question 3}
Do the results of this statistical measure for syntax agree with the
results for a phonological measure of distance on the same data?
\end{frame}
\section{Methods}
\begin{frame}
\frametitle{Feature Set}
Capture syntactic information by representing individual units of
information.
\[\xymatrix@C=1pc{
\textrm{Corpus} \ar@{>}[d]|{} &
S = s_0,s_1,\ldots
\ar@{>}[dd]|{f}
&&
T = t_0,t_1,\ldots
\ar@{>}[dd]|{f}
\\
\textrm{Decomposition} &&&\\
&
*{\begin{array}{c}
\left[ + f_0, +f_1 \ldots \right], \\
\left[ - f_0, +f_1 \ldots \right], \\
\ldots \\ \end{array}}
&&
*{\begin{array}{c}
\left[ + f_0, -f_1 \ldots \right], \\
\left[ + f_0, -f_1 \ldots \right], \\
\ldots \\ \end{array}} \\
} \]
\end{frame}
\begin{frame}
\frametitle{Feature Sets}
\begin{tabular}{c|c}
Leaf-Ancestor Paths & Nested Structure \\ \hline
Leaf-Head Paths & \\
Leaf-Head Paths, based on Timbl training &Long-distance context \\
Leaf-Arc Paths & \\ \hline
Phrase Structure Rule & Internal Structure \\
Phrase Structure Rule with Grandparent & \\ \hline
Trigrams & Context and order \\ \hline
Unigrams & Baseline \\ \hline
Combined & \\
\end{tabular}
\end{frame}
\begin{frame}
\frametitle{Distance Measure}
Combine two sets of features into a single number.
\[\xymatrix@C=1pc{
\textrm{Decomposition}\ar@{>}[dd] &&&\\
&
*{\begin{array}{c}
\left[ + f_0, +f_1 \ldots \right], \\
\left[ - f_0, +f_1 \ldots \right], \\
\ldots \\ \end{array}}
\ar@{>}[dr]
&&
*{\begin{array}{c}
\left[ + f_0, -f_1 \ldots \right], \\
\left[ + f_0, -f_1 \ldots \right], \\
\ldots \\ \end{array}}
\ar@{>}[dl] \\
\textrm{Combination} && \textrm{Distance} & \\
} \]
\end{frame}
\begin{frame}
\frametitle{Measures}
\begin{tabular}{c|c}
$R$ & $\sum_i |a_i - b_i|$ \\
$R^2$ & $\sum_i (a_i - b_i)^2$ \\ \hline
Kullback-Leibler divergence & $\sum_i {a_i \log\frac{a_i}{b_i} + b_i \log\frac{b_i}{a_i}}$ \\
Jensen-Shannon divergence & $\sum_i {a_i \log\frac{a_i}{\bar{c_i}} + b_i \log\frac{b_i}{\bar{c_i}}}$ \\ \hline
Cosine similarity & $\cos(a,b)$ \\
\end{tabular}
\end{frame}
\begin{frame}
\frametitle{Sampling / Iterations}
\begin{itemize}
\item 1000 sentences with replacement
\item All sentences
\end{itemize}
\begin{itemize}
\item 1 normalization iteration
\item 5 normalization iterations
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Input Processing: Corpora}
\begin{columns}
\column[c]{0.5\textwidth}
The Swediasyn is a dialect corpus collected in 2000 from more than
100 villages throughout Sweden and Swedish-speaking Finland. Four
speakers from each village were interviewed. The interviews were
later transcribed, 30 so far.
\column[c]{0.5\textwidth}
Talbanken is a mixed newspaper / speech corpus collected in the
1970s. It is used for training automatic annotators.
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Input Processing: Annotators}
\begin{itemize}
\item Trigrams'n'Tags (TnT) for part-of-speech tagging
\item The Berkeley parser for phrase-structure parsing
\item MaltParser for dependency parsing
\end{itemize}
\end{frame}
% \begin{frame}
% \frametitle{Phonology}
% Determine average value for insertion and deletion--this should be
% half the average substitution cost for arbitrary segments.
% \end{frame}
\begin{frame}
\frametitle{Output Processing: Significance}
\begin{enumerate}
\item Find $d = R(\mathit{sample}(a),\mathit{sample}(b))$.
\item Now, at least 20 times:
\item $\mathit{shuffled} = \mathit{shuffle}(a,b)$
\item $\mathit{shuffle}_a = \mathit{sample}(\mathit{shuffled})$
\item $\mathit{shuffle}_b = \mathit{sample}(\mathit{shuffled})$
\item Find $d_{\mathit{shuffle}} = R(\mathit{shuffle}_a,\mathit{shuffle}_b)$
\item Is $d_{\mathit{shuffle}} < d$? It should be; shuffling should destroy any significant differences.
\item If $d_{\mathit{shuffle}} < d$ more than 95\% of the time, $d$ is significant.
\end{enumerate}
\end{frame}
\begin{frame}
\frametitle{Output Processing: Correlation with Travel Distance}
\begin{itemize}
\item Dialects generally correlate with distance
\item Swedish dialects have no sharp boundaries, meaning that
correlation should be even better.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Output Processing: Cluster Analysis}
Produce hierarchical clusters which:
\begin{itemize}
\item Put each site in a unique cluster (hard clustering)
\item Consensus trees improve stability by using only clusters that
occur in the majority of parameter settings
\item Composite cluster maps display clusters as boundaries, similar
to isoglosses.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Output Processing: Multi-Dimensional Scaling}
Produce 3D clusters which:
\begin{itemize}
\item Do not put sites in unique clusters (soft clustering)
\item Map high-dimensional dissimilarity space to 3-dimensional distance
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Output Processing: Feature Ranking}
\begin{itemize}
\item Compares the rate of individual features between regions.
\item Features are characteristic of one or the other of a region
pair.
\end{itemize}
\end{frame}
\section{Results}
\begin{frame}
\frametitle{Organization}
Results are grouped by sample size and normalization.
\begin{tabular}{c|c|c|}
& 1 iteration & 5 iterations \\ \hline
1000 sentences & 1-1000 & 5-1000 \\ \hline
Full site & 1-full & 5-full \\ \hline
\end{tabular}
\end{frame}
\begin{frame}
\frametitle{Significance}
\begin{center}
\begin{tabular}{c|c|c|}
& 1 & 5 \\ \hline
1000 & $\circ$ & $\circ$\\ \hline
Full & $\circ$ & $\times$\\ \hline
\end{tabular}
\end{center}
\begin{itemize}
\item The highest number of significant distances was found with the 1-1000 setting.
\item However, decreasing significance generally means more informative
results.
\item As classifiers become more sensitive, they deal with increased
noise, which means less significance.
% This trend holds across feature sets, such that trigrams give
% the best trade-off between significance and sensitivity.
% (Unfortunately. This is a pretty low-end tradeoff.)
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Correlation with Travel Distance}
\begin{center}
\begin{tabular}{c|c|c|}
& 1 & 5 \\ \hline
1000 & 0.24 & 0.22\\ \hline
Full & 0.29 & 0.18 \\ \hline
\end{tabular}
\end{center}
\begin{itemize}
% \item Correlation with travel and geographic distance is low but significant.
% \item Most correlations are 0.2 to 0.3, with a high of 0.37.
\item This is lower than predicted by the ``boundary-free'' view of
Swedish dialects. % which is the usual, well accepted, view
\item Correlation with travel distance is slightly better than with
geographic distance.
% probably this points to a shortcoming in the distance measure
% rather than new information about Swedish dialects.
% also, given the size-geographical distance correlation (0.45**),
% it all may be epiphenomenal and we learn nothing from looking at
% correlations.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Clusters and Consensus Trees}
\begin{itemize}
% \item Consensus trees are the easiest way to interpret hierarchical clusters.
\item Of the previous groups, the 5-1000 setting
retains the most detailed consensus tree.
\item Where the others have clusters, they agree with 5-1000's.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Clusters and Consensus Trees}
\begin{columns} \column[c]{0.5\textwidth}
\begin{center}
\includegraphics[scale=0.3]{Sverigekarta-Landskap-consensus-5-1000}
\end{center} \column[c]{0.5\textwidth}
\begin{itemize}
\item When projected on a map of Sweden, this tree reproduces the
major dialect areas.
\item (North, East, West, South)
% \item South divides into multiple clusters.
% \item But North and East are part of the same cluster.
% maybe an effect of cities? Cities are big these days
\end{itemize}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Composite Cluster Maps}
\begin{columns}
\column[c]{0.5\textwidth}
\includegraphics[scale=0.3]{Sverigekarta-cluster-5-1000}
\column[c]{0.5\textwidth}
The composite cluster maps reproduce dialectology's
\begin{itemize}
\item Weak boundaries between regions.
\item North-to-south gradient.
\end{itemize}
Notably, the southern boundary is stronger than the rest, and
Tors\aa{}s/J\"amsh\"og/\"Ossj\"o are further isolated within this area.
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Multi-Dimensional Scaling}
\begin{itemize}
\item MDS maps are harder to analyze because they cannot be combined.
\item However, most MDS maps show the same pattern as the other two
mapping methods.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Features}
\begin{itemize}
\item Overall, feature extraction was biased by the two
normalization methods.
\item Without the overuse normalization, the results tend to be
features that occur in \emph{every} dialect; it only highlights
differences.
\item With the overuse normalization, the results tend to be quite
noisy.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Features}
Nonetheless, some interesting features appeared:
\begin{itemize}
\item The southern clusters had odd positioning of adverbs,
conjunctions and ends of sentences.
\item This may point to non-standard placement of adverbs.
\item The tightest southern cluster showed possible use of double modals.
\end{itemize}
\end{frame}
\section{Discussion}
\begin{frame}
\frametitle{Comparison to Dialectology}
Good agreement at the levels of
\begin{itemize}
\item regions (consensus tree maps, MDS maps)
\item boundaries (composite cluster maps)
\item distances (composite cluster maps)
\end{itemize}
But not for specific features.
\end{frame}
\begin{frame}
\frametitle{Comparison to Dialectology Features}
\begin{itemize}
\item About a third of the results were positive, a third negative
and a third inconclusive.
\item However, none were very strong.
\item None could be verified as significant, given current methods.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Comparison to Phonological Dialectometry}
\begin{itemize}
\item Agreement with Leinonen's variable maps is good.
\item All syntactic boundaries have an analogue in a boundary for
some phonological variable.
\item Results are preliminary; correlation is needed to know whether
the similarities are significant.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Comparison to Syntactic Dialectometry}
\begin{itemize}
\item This dissertation shows agreement with dialectology.
\item Previous work found significant differences but did not
produce distances, regions or boundaries that agreed with
dialectology.
\item The sites used here are smaller than in previous work.
\item This indicates that corpora specifically collected for dialect
work still have an advantage.
\item The parameter variations analyzed here guide future research.
\end{itemize}
\end{frame}
\section{Conclusion}
\begin{frame}
\frametitle{Future Work}
\begin{itemize}
\item Correlation with phonological dialectometry.
\item Rapid analysis of the rest of Swedia and Nodalida.
\item Improved feature normalization.
\item Significance testing for feature-by-feature comparisons.
\item Improved automatic annotation to reduce feature noise.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Conclusion}
This dissertation establishes the reliability and utility of a
statistical measure for syntactic dialectometry.
\end{frame}
\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End: