% pres.tex

\documentclass{beamer}

\usetheme{Ilmenau}
\usepackage[all]{xy}

\title{Syntactic and Phonological Distance in British Dialects}
\author{Nathan Sanders}
\date{\today}

\begin{document}

% Opening slide, placed before the title page: a bulleted list of
% tongue-in-cheek one-liners.  Commented-out items are kept for reference.
\begin{frame}{Welcome to Jones!}
  \begin{itemize}
    \item You will join an elite, but rapidly dwindling group of brawling immortals. Try to keep your head.
    % \item The Norwegians may be watching you. Act casual and pretend to be an American Tourist.
    \item Try O'Caml, a Very Nice Functional Language!
    \item You will kill your father and marry your mother.
    \item Androids from the future will appear and fight over your destiny.
    \item The key to defeating the French is to avoid looting.
    % \item Only the pure of heart can remove the sword from the stone.
    \item Never trust a monkey.
    \item Family tragedy and a drug overdose will let you see the future.
    \item USE LINUX, THE FREE OPERATING SYSTEM BY LINUS TORVALDS.
    % \item You will become independently wealthy and spend your time
    %   exploring ruinous ruins.
    \item You will age rapidly and become a wizard's servant.
    \item It's a wonderful life!
  \end{itemize}
\end{frame}

% Title page built from \title, \author and \date.
\begin{frame}
  \titlepage
\end{frame}

% Section with an empty long title and the short title "Outline";
% its only slide is the table of contents.
\section[Outline]{}
\begin{frame}
  \tableofcontents
\end{frame}
% Summary slide: the research question, then where the slides and the paper live.
\begin{frame}
  Compare syntactic and phonological distance measures from
  dialectometry to see if the two agree, using British English dialect data.

  \vspace{2cm}

  % \url (from hyperref, which beamer loads) typesets a real "~" and makes the
  % address a link; the previous \~{} produced an accent-style tilde instead.
  \url{http://jones.ling.indiana.edu/~ncsander/dialect/pres.pdf}\\
  % This address is abbreviated with an ellipsis, so it stays plain typewriter text.
  \texttt{http://\ldots{}/sanders-qual\_phonology-syntax-distance.pdf}
\end{frame}
\section{Background}
% Two working definitions that open the Background section.
\begin{frame}{Dialectology}
  \begin{definition}
    Dialectology looks for boundaries between languages.
  \end{definition}
  \begin{definition}
    A dialect is mutually intelligible with other dialects in the same language.
    It is not a very precise term.
  \end{definition}
  % Speaker notes:
  %   Dialect does double duty in linguistics and sociology. A better
  %   linguistic approach would be to define language as a tree-like
  %   concept with sub-languages related to each other.
  %
  %   Dialectometry uses agglomerative techniques and clever math to do
  %   dialectology
\end{frame}
% Single-image slide showing the graphic file total-county.
\begin{frame}{Before: Dialectology}
  \includegraphics[width=0.9\textwidth]{total-county}
  % Speaker notes:
  %   omg, they say "soft drink" in Shelbyville??! what idiots. Also my
  %   grandma says 'pop'.
  %   Sorry, this tells you NOTHING new. It only reflects existing
  %   boundaries. No* contribution to new knowledge of boundaries.
\end{frame}
% Two-column slide: the definition of dialectometry on the left,
% the graphic file demo1 on the right.
\begin{frame}{After: Dialectometry}
  % Speaker notes:
  %   Statistics can tell you if something is significant, but only if
  %   you have multiple samples.
  %   Once you have multiple samples, you can start telling HOW
  %   different something is.
  %   Now you get the old SOLID boundaries but also gradients.
  %   And you know exactly how important they are.
  \begin{columns}
    \column[c]{0.5\textwidth}

    \begin{definition}
      Dialectometry uses aggregate data techniques to find differences
      between language areas.
    \end{definition}
    Map from the LAMSAS, generated by RuG-L04
    \column[c]{0.5\textwidth}
    \includegraphics[width=0.9\textwidth]{demo1}
  \end{columns}
\end{frame}
% Untitled slide weighing dialectometry: one column for the advantage,
% one for the drawback, then the consequence underneath.
\begin{frame}
  \begin{columns}
    \column[c]{0.5\textwidth}
    Pro:\\
    The result is significant and gradient.
    \column[c]{0.5\textwidth}
    Con: \\
    Methods are harder to develop.
  \end{columns}
  \vspace{2cm}
  Hence: Most dialectometry has worked with phonology.
  % Speaker notes:
  %   good methods have existed for a long time in phonology
  %   not so in syntax
\end{frame}

\section{Methods}
\subsection{Phonology}
% Recursive definition of the Levenshtein distance between prefixes of S and T.
% Consuming s_i alone is a deletion and consuming t_j alone is an insertion,
% which matches ins(t_j) / del(s_i) on the cost slide that follows.
\begin{frame}
  \frametitle{Levenshtein Distance}
  % Speaker notes:
  %   This makes for a pretty equation OR tail-recursive, extraordinarily
  %   efficient code implemented entirely in Scheme. You get the
  %   equation today.
  \begin{definition}
    For each character $s_i \in S$ and $t_j \in T$ for any strings $S$ and $T$,
    \begin{equation}
      % \mathit keeps each multi-letter name together as one identifier;
      % \min is the upright operator.
      \mathit{levenshtein}(s_i,t_j) = \min \left(
        \begin{array}{l}
          \mathit{del}(s_i)+\mathit{levenshtein}(s_{i-1},t_j), \\
          \mathit{ins}(t_j)+\mathit{levenshtein}(s_i,t_{j-1}), \\
          \mathit{sub}(s_i,t_j)+\mathit{levenshtein}(s_{i-1},t_{j-1})
        \end{array} \right)
      \label{levequation}
    \end{equation}
    % The arguments are the last characters of each string, hence lowercase s and t.
    The total distance between $S$ and $T$ is $\mathit{levenshtein}(s_{|S|},t_{|T|})$.
  \end{definition}
\end{frame}
% Cost model for the three edit operations: insertion and deletion cost 1,
% substitution costs 0 for identical characters and 2 otherwise.
\begin{frame}
  \frametitle{Levenshtein Distance}
  % Speaker notes:
  %   Don't worry about the details. The important part is how $ins$,
  %   $sub$ and $del$ are defined. Here is one of the simplest
  %   definitions:
  \begin{definition}
    \begin{equation}
      % \mathit keeps each operation name together as a single identifier.
      \begin{array}{l}
        \mathit{ins}(t_j) = 1 \\
        \mathit{del}(s_i) = 1 \\
        \mathit{sub}(s_i,t_j) = \left\{
          \begin{array}{ll}
            0 & \textrm{if $s_i=t_j$} \\
            2 & \textrm{otherwise}
          \end{array} \right.
      \end{array}
    \end{equation}
  \end{definition}
\end{frame}
% Step-by-step edit of ART into CAT; each \pause reveals the next row.
% Left column: the string with an arrow marking the current position.
% Right column: the running cost after the operation shown.
\begin{frame}
  % Title capitalised to match the two preceding "Levenshtein Distance" slides.
  \frametitle{Levenshtein Distance}
  \begin{example}
    \begin{tabular}{l|r}
      $_\uparrow$ART  & \texttt{start=0} \\
      \pause
      C$_\uparrow$ART & \texttt{0+ins(C)=1} \\
      \pause
      CA$_\uparrow$RT & \texttt{1+sub(A,A)=1} \\
      \pause
      CA$_\uparrow$T  & \texttt{1+del(R)=2} \\
      \pause
      CAT$_\uparrow$  & \texttt{2+sub(T,T)=2} \\
    \end{tabular}
  \end{example}
\end{frame}
% Full distance table for ART (columns) against CAT (rows).
% The bold entries 0, 1, 1, 2, 2 are the running totals of the worked example.
\begin{frame}
  % Title capitalised to match the earlier "Levenshtein Distance" slides.
  \frametitle{Levenshtein Distance}
  \begin{figure}
    % \centering instead of a center environment: no extra vertical space in the float.
    \centering
    \caption{The distance table for ``ART'' to ``CAT''}
    \label{art2cattable}
    \begin{tabular}{c|c|c|c|c}
      % \hline
        &              & A            & R            & T            \\
      \hline
        & $\mathbf{0}$ & 1            & 2            & 3            \\
      \hline
      C & $\mathbf{1}$ & 2            & 3            & 4            \\
      \hline
      A & 2            & $\mathbf{1}$ & $\mathbf{2}$ & 3            \\
      \hline
      T & 3            & 2            & 3            & $\mathbf{2}$
      % \hline
    \end{tabular}
  \end{figure}
\end{frame}
\subsection{Syntax}
% One-line statement of the syntactic method.
\begin{frame}{Syntax}
  A permutation test over $R$ over leaf-ancestor paths.
\end{frame}
% Definition of the R measure: the sum over types of the absolute
% difference between the two corpora's counts.
\begin{frame}
  \frametitle{R}
  \begin{definition}
    \begin{equation}
      % \sum is the summation operator; \Sigma only sets the Greek capital letter.
      R = \sum_i |c_{ai} - c_{bi}|
      \label{rmeasure}
    \end{equation}
  \end{definition}
  Given two corpora $a$ and $b$, $c_a$ and $c_b$ are the type
  counts. $i$ ranges over all types, so $c_{ai}$ and $c_{bi}$ are the
  type counts for type $i$.
\end{frame}
% R as a two-line Python function, followed by a worked example on three types.
\begin{frame}
  \frametitle{R}
  % The function now returns its result, and pairs counts by type key instead
  % of zipping two dict.values() views by position.  It assumes both
  % dictionaries have the same keys, as in the example below.
  % \hspace* survives the line break, so the body line is really indented.
  \texttt{def R(a, b):}\\
  \hspace*{2em}\texttt{return sum(abs(a[t] - b[t]) for t in a)}
  \begin{example}
    \[a=\{\textrm{NP-Det}:12, \textrm{NP-N}:10, \textrm{VP-V}:5\}\]
    \[b=\{\textrm{NP-Det}:3, \textrm{NP-N}:12, \textrm{VP-V}:20\}\]
    % 9 + 2 + 15 = 26
    \[R = \{|12-3| + |10-12| + |5-20|\} = 26 \]
  \end{example}
\end{frame}
% A parse tree for "the dog barks" (left) and the root-to-leaf label
% sequences read off it (right), one per word.
\begin{frame}
  \frametitle{Leaf-Ancestor Paths}
  The parse tree
  \begin{columns}
    \column[c]{0.5\textwidth}
    % xy-pic matrix: each \ar@{-} draws an undirected edge to the cell in the
    % given direction (d = down, l = left, r = right).
    \[\xymatrix{
      &&\textrm{S} \ar@{-}[dl] \ar@{-}[dr] &&\\
      &\textrm{NP} \ar@{-}[d] \ar@{-}[dl] &&\textrm{VP} \ar@{-}[d]\\
      \textrm{Det} \ar@{-}[d] & \textrm{N} \ar@{-}[d] && \textrm{V} \ar@{-}[d] \\
    \textrm{the}& \textrm{dog} && \textrm{barks}\\}
    \]
    \column[c]{0.5\textwidth}
    creates the following leaf-ancestor paths:
    \begin{itemize}
      % Leaf spelled "the" to match the lowercase leaf in the tree.
      \item S-NP-Det-the
      \item S-NP-N-dog
      \item S-VP-V-barks
    \end{itemize}
  \end{columns}
\end{frame}
% Steps of the permutation test: compare the observed distance d with the
% distances obtained after shuffling the two corpora together.
\begin{frame}
  \frametitle{Permutation Test}
  % \mathit keeps each multi-letter name together as a single identifier.
  \begin{enumerate}
    \item Find $d = R(\mathit{sample}(a),\mathit{sample}(b))$.
    \item Now, at least 20 times:
    \item $\mathit{shuffled} = \mathit{shuffle}(a,b)$
    \item $\mathit{shuffle}_a = \mathit{sample}(\mathit{shuffled})$
    \item $\mathit{shuffle}_b = \mathit{sample}(\mathit{shuffled})$
    \item Find $d_{\mathit{shuffle}} = R(\mathit{shuffle}_a,\mathit{shuffle}_b)$
    \item Is $d_{\mathit{shuffle}} < d$? It should be; shuffling should destroy any significant differences.
    \item If $d_{\mathit{shuffle}} < d$ more than 95\% of the time, $d$ is significant.
  \end{enumerate}
\end{frame}
\section{Experiment}
\subsection{Corpora}
% Overview slide: the graphic GB_GOR98_A4 beside the two data sources.
\begin{frame}{Corpora}
  \begin{columns}
    \column[c]{0.5\textwidth}
    \includegraphics[width=1.0\textwidth]{GB_GOR98_A4}
    \column[c]{0.5\textwidth}
    \begin{itemize}
      \item Phonology: Survey of English Dialects (SED)
      \item Syntax: International Corpus of English, Great Britain (ICE-GB)
    \end{itemize}
  \end{columns}
\end{frame}
% Details of the phonology corpus.
\begin{frame}{SED}
  \begin{itemize}
    \item Classic dialectology corpus: collected from interviews in the 1950s,
      a couple of hundred items.
    \item One entry per county, NORM sources.
    \item The SED tries to capture the older uses of the language.
  \end{itemize}
\end{frame}
% Details of the syntax corpus.
\begin{frame}{ICE-GB}
  \begin{itemize}
    \item Conversation (and writing), syntactically annotated.
    \item Collected in the 1990s, various sources with place-of-birth noted.
  \end{itemize}
\end{frame}
\subsection{Parameters}
% Parameter choice for the phonological measure.
\begin{frame}
  \frametitle{Phonology}
  % "---" sets the em-dash this aside needs; "--" is the en-dash used for ranges.
  Determine average value for insertion and deletion---this should be
  half the average substitution cost for arbitrary segments.
\end{frame}
% Parameter grid for the syntactic measure: one row per parameter,
% two alternative settings per row.
\begin{frame}{Syntax}
  \begin{tabular}{l|cr}
    measure          & $R$     & $R^2$              \\
    sample size      & 500     & 1000               \\
    input processing & trigram & leaf-ancestor path \\
  \end{tabular}
\end{frame}
% Procedure for checking a parameter setting against two regions assumed to differ.
\begin{frame}{Validity}
  To find out whether parameter settings are valid.
  \begin{enumerate}
    \item Assume that Scotland and London are different.
    \item Measure the difference between them according to the current
      parameter settings.
    \item Shuffle
    \item Split
    \item Test
    \item Repeat
  \end{enumerate}
\end{frame}
\section{Results}
% Three two-column image slides: both dendrograms side by side, then each
% dendrogram next to the graphic GB_GOR98_A4.
\begin{frame}{Results}
  \begin{columns}
    \column[c]{0.5\textwidth}
    \includegraphics[width=1.2\textwidth]{sed_dendrogram}
    \column[c]{0.5\textwidth}
    \includegraphics[width=1.2\textwidth]{ice_dendrogram}
  \end{columns}
\end{frame}
\begin{frame}{Results}
  \begin{columns}
    \column[c]{0.5\textwidth}
    \includegraphics[width=1.2\textwidth]{sed_dendrogram}
    \column[c]{0.5\textwidth}
    \includegraphics[width=0.8\textwidth]{GB_GOR98_A4}
  \end{columns}
\end{frame}
\begin{frame}{Results}
  \begin{columns}
    \column[c]{0.5\textwidth}
    \includegraphics[width=1.2\textwidth]{ice_dendrogram}
    \column[c]{0.5\textwidth}
    \includegraphics[width=0.8\textwidth]{GB_GOR98_A4}
  \end{columns}
\end{frame}
% Headline finding.
\begin{frame}
  \frametitle{Results}
  % R is the measure defined earlier, so it is set in math mode as on the other slides.
  There is no significant correlation between the methods, or between
  $R$ and corpus size.
\end{frame}
% Candidate explanations for the lack of correlation.
\begin{frame}
  \frametitle{Why not?}
  \begin{itemize}
    \item Problem with distance measures
    \item Problem with annotators
    % "from ... to ..." replaces the spaced hyphen used for the year range.
    \item Dialect changes from 1950 to 1990
    \item Place of birth may be a bad indicator of dialect
    \item Phonology and syntax may disagree
  \end{itemize}
\end{frame}
\section{Practicalities}
\subsection{Useful Code}
% Introduces the iceread Python code and where to get it.
\begin{frame}{\texttt{iceread}}
  \begin{itemize}
    \item I wrote some Python code that reads the ICE trees and allows you
      to sort and group them by speaker properties.
    \item It's available via subversion on Jones.
  \end{itemize}
\end{frame}
% Usage example for iceread.read, then the shell and Python commands to
% check the project out and load the module.
\begin{frame}
  \begin{example}
    \texttt{iceread.read('sspeakers.csv', 12)}
  \end{example}
  \begin{itemize}
    \item Log in to jones and find a good place to check out a project.
    \item \texttt{svn co /Volumes/Data/svnrep/britishdialects dialect}
    % Speaker note: as a bonus this gives you the code for what I've talked
    % about today
    \item \texttt{cd dialect/ice}
    \item \texttt{python2.5}
    % Speaker note: sorry, doesn't work on older versions of Python
    \item \texttt{>>> import iceread}
    \item \texttt{>>> speakers = iceread.read('sspeakers-region.csv', 12, delimiter=',')}
    % Speaker note: group speakers by column 12
  \end{itemize}
\end{frame}
% Empty slide titled "Demo".
\begin{frame}{Demo}
\end{frame}
% \begin{frame}[fragile]
%   % so.import the library and then the basic interface is
%   % iceread.read, which gives you the speakers from the corpus that
%   % are in the text file passed.
%   % The second parameter is the column to group by.
%   % The returned value is a dictionary, one key for each group. Each
%   % group is a list of parsed sentences (draw picture)
%   % http://jones.ling.indiana.edu/wiki/IceCorpus
% \begin{verbatim}
% >>> import iceread
% >>> speakers = iceread.read('sspeakers.txt', 12)
% >>> speakers.keys()
% ['Northeast', 'Southeast', 'Scotland', 'EastMidlands', 'India', 'Yorkshire', 'WestMidlands', 'China', 'Northwest', 'Wales', 'East', 'London', 'Spain', 'Southwest']
% >>> from pprint import pprint
% >>> pprint(speakers["A"][0])
% [('PU,CL',
%   [('SU,NP', ['NPHD,PRON']),
%    ('VB,VP', ['OP,AUX', 'MVB,V']),
%    ('OD,NP',
%     [('NPPR,AJP', [('AJPR,AVP', ['AVHD,ADV']), 'AJHD,ADJ']),
%      'NPHD,N',
%      ('NPPO,PP', ['P,PREP', ('PC,NP', ['NPHD,N', 'PAUSE,PAUSE'])])]),
%    ('A,PP', ['P,PREP', ('PC,NP', ['NPHD,N'])]),
%    ('A,PP',
%     ['P,PREP',
%      ('PC,NP', [('DT,DTP', ['DTCE,ART']), 'NPHD,N', 'PAUSE,PAUSE'])])]),
%  '']
% \end{verbatim} % there is also a low-level interface you can use to
%                % read the corpus in more flexibly, using either
%                % corpus, with a custom grouping of speakers (eg more
%                % than one property). Or you can even read a corpus
%                % file yourself and then uses sentences to extract the
%                % sentences from it. This gives you all the sentences
%                % in a single file sorted by speaker.
% \end{frame}

\subsection{Lessons}
% Practical lessons from running the experiment.
\begin{frame}{Things I learned}
  \begin{itemize}
    \item Always, \emph{always} make a repeatable build script for your experiment.
    % \item But {\tt bash} isn't the language to write that script in. %Try Python.
    \item Be \texttt{nice}. Watch your program execute the first time using
      \texttt{top}.
    \item Python's GC can be unreliable.
    \item Don't be afraid to rewrite your prototype in a faster language.
      It's not as hard as you think.
    \item Use a reliable source control system.
  \end{itemize}
\end{frame}
% Open question: the shuffle-and-split variant of the permutation test never
% came out significant, although it was expected to.
\begin{frame}
  \frametitle{A Mystery}
  % \mathit keeps each multi-letter name together as a single identifier.
  \begin{itemize}
    \item For the permutation test in syntax, the only method that came
      out significant was repeated sampling with replacement.
    \item However, shuffling the corpora together and splitting them to
      be the same size should have worked just as well.
    \item E.g., shuffle corpora $a$ and $b$ together into $\mathit{shuffled}$, then
      split it into $\mathit{shuffle}_a$ and $\mathit{shuffle}_b$. $\mathit{shuffle}_a$ has the
      same size as $a$, $\mathit{shuffle}_b$ has the same size as $b$.
    \item But this always gives $R(\mathit{shuffle}_a,\mathit{shuffle}_b)$ a slightly
      higher value than $R(a,b)$.
  \end{itemize}
\end{frame}
\end{document}
%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: 
Read order:
Phono- and tono-genesis:
I think this is boring, but maybe we should read at least one paper about it. I vote for
Svantesson based purely on the fact that Ken can also throw in his comments from
his paper.

--Near mergers-- and incomplete neutralisation:
well THIS is an explosive topic. Let me see.
I like
Nycz, J. 2005. The dynamics of near-merger in accommodation. (sounds
like it mentions both perception and production)
Yu, A C L forthcoming might be interesting, if only to rip on. It
seems to me to be difficult to construct a believable exemplar model.
-- Incomplete neutralisation --
Warner, N, A Jongman, J Sereno, R. Kemps mentions both  production and
perception. And of course one of Bob's papers might be good because we
could bug him about his ideas.
-- Frequency effects --
Jurafsky, D,A Bell and C Girand. 2002. seems better if it talks about
both production and perception. But of course the other one probably
does too.
-- Acquisition --
This is already ordered by interestingness
Bohn, O.S.; Flege, J.E. (1990). Perception and Production of a New Vowel Category by Adult Second Language Learners. In Leather, J. \& James, A. (Eds.) New Sounds 90: Proceedings of the 1990 Amsterdam Symposium on the Acquisition of Second-Language
Speech. Amsterdam: University of Amsterdam Press. pp. 37--56
Bradlow, A.R., D.B. Pisoni, R. Akahane-Yamada, & Y. Tohkura (1997).  Training 
Japanese listeners to identify English /r/ and /l/, IV: Some effects of perceptual 
learning on speech production.  Journal of the Acoustical Society of America, 101: 
2299 - 2310.   
Dorman, M. F., Ausberger, C., Bailey, P., & Raphael, L. J. (1978). The relationship
between speech perception and production in children who substitute /t/ or /d/ for
/s/. Journal of the Acoustical Society of America, 64(S1), p. S51
  THIS one should be under dialect, it sounds so interesting
Flege, J. E., Munro, M. J., MacKay, I. R. A. (1995). Factors affecting strength of 
perceived foreign accent in a second language. Journal of the Acoustical Society of America, 97, 3125--3134 

- especially L2 -
-- General issues --
I thought we already talked about this.
-- Dialect --
I guess that
Brasseur 2006
Clopper and Pisoni
Markham 1999
Niedzielski 2001

-- Accommodation --
Pardo 2006 is still the best
or what about
Smiljanic and Bradlow
Giles was pretty boring and would be expensive to copy