-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathds3_web.tex
461 lines (429 loc) · 14 KB
/
ds3_web.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
\documentclass[compress, black]{beamer}
\setbeamercolor{normal text}{fg=black}
\beamertemplatesolidbackgroundcolor{white}
\usecolortheme[named=black]{structure}
\usepackage{caption}
\captionsetup{labelformat=empty}
\setbeamertemplate{navigation symbols}{}
%\usefonttheme{structurebold}
\usepackage[scaled]{helvet}
\renewcommand*\familydefault{\sfdefault} %% Only if the base font of the document is to be sans serif
\usepackage[T1]{fontenc}
\usepackage{setspace}
%\usepackage{beamerthemesplit}
\usepackage{graphics}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{verbatim}
\usepackage{amssymb}
\usepackage{wrapfig}
\usefonttheme[onlymath]{serif}
\usepackage{cmbright}
\def\labelitemi{\textemdash}
\setbeamertemplate{frametitle}{
\begin{centering}
\vskip15pt
\insertframetitle
\par
\end{centering}
}
\title[DS]{Databases}
\author[Sood]{Gaurav~Sood}
\large
\date[2015]{Spring 2015}
\subject{LearnDS}
\begin{document}
\newcommand{\multilineR}[1]{\begin{tabular}[b]{@{}r@{}}#1\end{tabular}}
\newcommand{\multilineL}[1]{\begin{tabular}[b]{@{}l@{}}#1\end{tabular}}
\newcommand{\multilineC}[1]{\begin{tabular}[b]{@{}c@{}}#1\end{tabular}}
\newenvironment{large_enum}{
\Large
\begin{itemize}
\setlength{\itemsep}{7pt}
\setlength{\parskip}{0pt}
\setlength{\parsep}{0pt}
}{\end{itemize}}
\begin{comment}
setwd(paste0(basedir, "github/data-science/ds3/"))
tools::texi2dvi("ds3_web.tex", pdf=TRUE,clean=TRUE)
setwd(basedir)
\end{comment}
\frame
{
\titlepage
}
\frame{
\frametitle{Data}
\begin{large_enum}
\item[--]<2->Bits on non-volatile storage e.g. magnetic media, SSD
\item[--]<3->Not just bits on hard disk, but organized bits (a data model)
\begin{enumerate}
\item[-]<4-> Nested folders $\sim$ tree
\item[-]<5-> Rows and columns $\sim$ table
\end{enumerate}
\end{large_enum}
}
\frame{
\frametitle{Data Model}
\begin{large_enum}
\item[--]<1->Structure
\begin{enumerate}
\item[--]<2-> Rows and columns (Relational Database)
\item[--]<3-> Key-value pairs (NoSQL systems)
\item[--]<4-> Sequence of bytes (Files)
\end{enumerate}
\item[--]<5->Constraints
\begin{enumerate}
\item[--]<6-> All columns must be of same length
\item[--]<7-> Each value in this column has to be X
\item[--]<8-> Child cannot have two parents (tree) (before Gmail Labels)
\end{enumerate}
\item[--]<9->(Efficiently Supported) Operations
\begin{enumerate}
\item[--]<10-> Look up value of key x.
\item[--]<11-> Find me rows where column `lastname' is Jordan.
\item[--]<12-> Get next $n$ bytes.
\end{enumerate}
\end{large_enum}
}
\frame{
\frametitle{Questions to consider}
\begin{large_enum}
\item[--]<1-> What are the features of the data? How do we plan to use the data?
\item[--]<2-> How often are the data updated?
\item[--]<3-> What queries are efficiently supported?
\item[--]<4-> How are the data stored on disk -- the physical data model?
\item[--]<5-> How are data organized internally -- the abstract data model?
\end{large_enum}
}
\frame{
\frametitle{What are databases?}
\begin{large_enum}
\item[--]<1-> Organized collection of data.
\item[--]<2-> Organized to afford efficient retrieval (aim can vary).
\item[--]<3-> Data + Metadata about structure and organization.
\item[--]<4-> Data that are self-describing $\sim$ have a schema.
\item[--]<5-> Generally accessed through a Database Management System.
\end{large_enum}
}
\frame{
\frametitle{What do Database Systems (DBMS) do?}
\begin{large_enum}
\item[--]<1-> Provide \alert{efficient}, \alert{reliable}, \alert{convenient} and \alert{safe}, \alert{multi-user} storage of and access to \alert{massive} amounts of data.
\begin{enumerate}
\item[--]<2-> \textbf{Massive:} Terabytes. Handle data that reside outside memory.
\item[--]<3-> \textbf{Safe:} Robust to power outages, node failures etc.
\item[--]<4-> \textbf{Multi-user:} Concurrency control. Not one user/ one user per data item. File system concurrency.
\item[--]<5-> \textbf{Convenient:} High level query languages; declarative: what you want, not describe algorithm
\item[--]<6-> \textbf{Efficient:} Performace, performance, performance.
\item[--]<7-> \textbf{Reliable:} 99.99999\% up time.
\end{enumerate}
\end{large_enum}
}
\frame{
\frametitle{Key concepts}
\begin{large_enum}
\item[--]<1->Data model\\\normalsize
XML, relational model, etc.
\item[--]<2->Schema versus data distinction\\\normalsize
Types versus variables\\
Schema (types, structure), actual data\\
\item[--]<3->Data definition language (DDL)\\\normalsize
For setting up schema
\item[--]<4->Data manipulation or query language (DML)\\\normalsize
For querying and modifying the database
\end{large_enum}
}
\frame{
\Large Some Data Models
}
\frame{
\frametitle{XML}
\begin{large_enum}
\item[--]<1->Extensive Markup Language\\\normalsize
Similar to HTML\\
Tags describe data rather than how to format
\item[--]<2->Standard for data exchange and representation
\item[--]<3->XML
\begin{enumerate}
\item[--]<4-> Tagged elements
\item[--]<5-> Nesting of elements
\item[--]<6-> Attributes
\item[--]<7-> Not all elements have attributes
\end{enumerate}
\end{large_enum}
}
\begin{frame}
\frametitle<1>{Plain Text}
\frametitle<2>{XML}
\only<1>{
{\tt
June 5, 2006 \\
Floor Statement of Senator Barack Obama Federal Marriage Amendment \\
I agree with most Americans, with Democrats and Republicans, with Vice President Cheney, with over 2,000 religious leaders of all different beliefs, that decisions about marriage, as they always have, should be left to the states.
}
}
\only<2>{
{\tt
$<$DOC$>$\\
$<$DOCNO$>$obama-2006-06-05-01$<\//$DOCNO$>$\\
$<$TEXT$>$\\
I agree with most Americans, with Democrats and Republicans, with Vice President Cheney, with over 2,000 religious leaders of all different beliefs, that decisions about marriage, as they always have, should be left to the states.\\
$<\//$TEXT$>$\\
$<\//$DOC$>$
}
}
\end{frame}
\begin{frame}
\frametitle{JSON}
{\tt
\{u'category': u'Government official', u'username': u'RepJohnLewis', u'about': u'Official Congressional page for Rep. John Lewis', ... u'name': u'John Lewis', u'hometown': u'Troy, AL', ... u'website': u'johnlewis.house.gov', u'phone': u'(404) 659 - 0116', u'birthday': u'02/21/1940', u'likes': 62974, ...\}
}
\end{frame}
\frame{
\frametitle{Relational Model}
\begin{large_enum}
\item[--]<1->RDBMS-- Codd 1970
\item[--]<2->Database: set of named relations (or tables)
\item[--]<3->Each relation has a set of named attributes (or columns)
\item[--]<4->Each attribute (column) has a type (or domain)\\\normalsize
There is also concept of an enumerated domain
\item[--]<5->Each row (= tuple)
\item[--]<6->Schema-- structure, name, type of attributes
\item[--]<7->\alert{Everything is a table}
\end{large_enum}
}
\frame{
\frametitle{Relational Model}
\begin{large_enum}
\item[--]<1->Relationships are implicit; no pointers
\begin{enumerate}
\item[--]<2-> Physical Data independence
\item[--]<3-> Compare to Network Data Model etc.
\end{enumerate}
\item[--]<4->Example:
\begin{enumerate}
\item[--]<5-> \lbrack course, Student id \rbrack, \lbrack student id, student name\rbrack
\item[--]<6-> Just a shared id
\item[--]<7-> Really bad for performance, all student names for a course, look up all the ids
\item[--]<8-> Hierarchical can be quicker if all students in one course
\item[--]<9-> But to look up all courses student X has taken, just as good (bad)
\end{enumerate}
\end{large_enum}
}
\frame{
\frametitle{Relational Vs. XML}
\begin{table}[h]
\begin{tabular}{l|l|l}
& Relational & XML \\
Structrure & Tables & Hierarchical \\
Schema & Fixed in advance & Flexible, self-describing\\
Queries & Easy & Querying not as easy\\
Ordering & Unordered & Implied order (document) \\
\end{tabular}
\end{table}
}
\frame{
\frametitle{Algebra of Tables}
\begin{large_enum}
\item[--]<1->In algebra, concept of algebraic optimization -- Rules to simply calculation
\item[--]<2->SQL uses relational algebra
\item[--]<3->All RDBMS optimize relational algebra `calculations'
\item[--]<4->Cost-based optimization
\item[--]<5->\alert{We can leave this optimization to program}
\end{large_enum}
}
\frame{
\frametitle{Algebra of Tables}
\begin{large_enum}
\item[--]<1->Basic Relational Algebra
\begin{enumerate}
\item[--]<2->Union, Intersection, Difference\\
\item[--]<3->Selection, $\sigma$\\
\item[--]<4->Projection, $\Pi$\\
\item[--]<5->Join, $\bowtie$
\end{enumerate}
\item[--]<6->Extended Relational Algebra:
\begin{enumerate}
\item[--]<7->Duplicate elimination, $d$\\
\item[--]<8->Grouping and aggregation, $g$\\
\item[--]<9->Sorting, $t$
\end{enumerate}
\item[--]<10->Sets Vs Bags
\begin{enumerate}
\item[--]<11->Sets: \{a,b,c\} (RA, Papers)
\item[--]<12->Bags: \{a,a,b,c\} (SQL)
\end{enumerate}
\end{large_enum}
}
\frame{
\frametitle{Select Rows}
\begin{large_enum}
\item[--]<1->Tuples (rows) that satisfy a particular criteria
\item[--]<1->$\sigma_\text{condition} \text{Relation}$\\
Pick Certain Rows
\item[--]<2->$\sigma_{GPA > 3} \text{Student}$\\
$\sigma_{GPA > 3 \wedge LastName=='Smith'} \text{Student}$\\
\item[--]<3->In SQL, selection using `Where' (and project using `Select')
\end{large_enum}
}
\frame{
\frametitle{Project}
\begin{large_enum}
\item[--]<1->$\Pi_\text{A1, A2} \text{Relation}$\\
Pick Certain Columns
\item[--]<2->$\Pi_\text{SSN, GPA} \text{Student}$\\
\item[--]<3->Both Select and Project\\
$\Pi_\text{SSN, GPA} (\sigma_{GPA > 3} \text{Student})$\\
$\Pi_\text{A1, A2} \text{Expression}$\\
$\sigma_\text{condition} \text{Expression}$\\
\end{large_enum}
}
\frame{
\frametitle{SQL}
Select Col1, Col2 (or *)\\
FROM R1\\
Where Condition
}
\frame{
\frametitle{Cross-Product}
\begin{large_enum}
\item[--]<1->Cartesian Product
\item[--]<2->R1 X R2; Every tuple in R1 with each tuple in R2
\item[--]<3->Size of ouput: R1 * R2
\item[--]<4->SELECT *\\
FROM R1 CROSS JOIN R2;
\item[--]<5->SELECT *\\
FROM R1, R2;
\end{large_enum}
}
\frame{
\frametitle{Natural Join}
\begin{large_enum}
\item[--]<1->Means Equi-Join
\item[--]<2->For every record in R1, find a tuple in R2 that satisfies a particular condition
\item[--]<3->Select * \\
From R1, R2\\
Where R1.A = R2.A
\item[--]<4->Or, do the cross-product and then filter
\item[--]<5->Select *\\
From R1 Join R2\\
On R1.A = R2.B\\
\end{large_enum}
}
\frame{
\frametitle{Theta Join}
\begin{large_enum}
\item[--]<1->Join without the equality condition
\item[--]<2->Equi-Join is a special case of theta join
\item[--]<3->Find ALL weather stations within 5 miles of centroid of a zip code
\item[--]<4->Select Distinct WStations\\
From Wstations h, ZipCode s\\
Where distance(h.location, s.location) < 5\\
\item[--]<4->`distance' is a user-defined function
\item[--]<5->Band Joins or Range Joins
\end{large_enum}
}
\frame{
\frametitle{Theta Join}
\begin{large_enum}
\item[--]<1->Outer Join
\item[--]<2->All tuples in R1 and pad out other values with NULLs
\item[--]<3->Left outer join\\
Select *\\
From R1 LEFT OUTER Join R2\\
On R1.A = R2.B
\item[--]<4->Right outer join
\item[--]<5->Full outer join
\end{large_enum}
}
\frame{
\frametitle{Union}
\begin{large_enum}
\item[--]<1->R1 U R2\\
Removes duplicates by default
\item[--]<2->Select * FROM R1\\
UNION\\
Select * FROM R2
\item[--]<3->If you wanted duplicates, you want to do `UNION ALL'
\end{large_enum}
}
\frame{
\frametitle{Difference}
\begin{large_enum}
\item[--]<1->R1 - R2
\item[--]<2->Select * From R1\\
Except\\
Select * From R2
\item[--]<3->Looks up all tuples in R1 and takes out tuples that it also sees in R2\\\normalsize
R2 can have other tuples\\
Everything in R1, remove things that also appear in R2
\end{large_enum}
}
\frame{
\frametitle{Intersection}
\begin{large_enum}
\item[--]<1->Not a fundamental operator
\item[--]<2->R1 Intersect R2 = R1 - (R1 - R2)
\item[--]<3->Can also express it as join
\end{large_enum}
}
\frame{
\frametitle{(Virtual) Views}
\begin{large_enum}
\item[--]<1->Defining and Using (Virtual) Views\\\normalsize
Three-level vision of database\\
Physical layer, Conceptual (abstraction of the data, relations), Logical layer (further abstraction, )\\
Real applications use lots and lots of views
\item[--]<2->Benefit of Views\\\normalsize
Hide some data from some users (Authorization etc.)\\
Make certain queries easier, more natural\\
Modularity of database access
\item[--]<3->Views\\\normalsize
Query over relations\\
View V = ViewQuery(R1, R2,... RN)\\
Schema of V is schema of query result
\end{large_enum}
}
\frame{
\frametitle{Query over views}
\begin{large_enum}
\item[--]<1->V can be thought of as a table
\item[--]<2->Q is actually rewritten in terms of R1, R2, ....RN
\item[--]<3->R1 can also be view
\item[--]<4->CREATE View VName (A1, ..., AN) As Query
\item[--]<5->Just define the view and use them (System optimizes in terms of `base tables')
\item[--]<6->View contents not stored
\item[--]<7->A convenient view is a join
\end{large_enum}
}
\frame{
\frametitle{Modifying Views}
\begin{large_enum}
\item[--]<1->We cannot insert, update, delete as V isn't stored
\item[--]<2->To alter V, we have to modify the base tables
\end{large_enum}
}
\frame{
\frametitle{Materialized Views}
\begin{large_enum}
\item[--]<1->Creates a physical table
\item[--]<2->But \ldots
\begin{enumerate}
\item[--]<3->Tables can be very large
\item[--]<4->What happens when modifications happen to the base tables
\item[--]<5->Modification to base data can invalidate the view
\end{enumerate}
\end{large_enum}
}
\frame{
\frametitle{SQLite: Data Types}
\begin{large_enum}
\item[--]<1->NULL -- The value is a NULL value
\item[--]<2->INTEGER -- a signed integer
\item[--]<3->REAL -- a floating point value
\item[--]<4->TEXT -- a text string
\item[--]<5->BLOB -- a blob of data
\end{large_enum}
}
\end{document}