Skip to content

Commit

Permalink
Added Netflix titles screencast
Browse files Browse the repository at this point in the history
  • Loading branch information
dgrtwo committed Apr 20, 2021
1 parent ff2ae06 commit 01947eb
Showing 1 changed file with 283 additions and 0 deletions.
283 changes: 283 additions & 0 deletions 2021_04_20_netflix_titles.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
---
title: "Netflix Titles"
date: 2021-04-20
output: html_output
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(tidytuesdayR)
library(scales)
theme_set(theme_light())
```

# Load the weekly Data

Dowload the weekly data and make available in the `tt` object.

```{r Load}
library(lubridate)
tt <- tt_load("2021-04-20")
netflix_titles <- tt$netflix_titles %>%
separate(duration, c("duration", "duration_units"), sep = " ", convert = TRUE) %>%
mutate(date_added = mdy(date_added),
year_added = year(date_added)) %>%
mutate(mature = rating %in% c("TV-MA", "R", "NC-17"))
```

```{r}
netflix_titles %>%
ggplot(aes(release_year, fill = type)) +
geom_histogram(binwidth = 5) +
facet_wrap(~ type, ncol = 1, scales = "free_y")
netflix_titles %>%
#count(year = 2 * (release_year %/% 2), type) %>%
count(year = release_year, type) %>%
group_by(type) %>%
mutate(percent = n / sum(n)) %>%
ggplot(aes(year, percent, color = type)) +
geom_line()
```

```{r}
netflix_titles %>%
filter(type == "Movie") %>%
mutate(decade = 10 * (release_year %/% 10)) %>%
ggplot(aes(decade, duration, group = decade)) +
geom_boxplot()
```

```{r}
summarize_titles <- function(tbl) {
tbl %>%
summarize(n = n(),
median_duration = median(duration),
median_year = median(release_year)) %>%
arrange(desc(n))
}
```


```{r}
netflix_titles %>%
count(rating, sort = TRUE)
netflix_titles %>%
separate_rows(listed_in, sep = ", ") %>%
group_by(type, genre = listed_in) %>%
summarize_titles() %>%
filter(type == "Movie") %>%
filter(genre != "Movies") %>%
mutate(genre = fct_reorder(genre, median_duration)) %>%
ggplot(aes(median_duration, genre)) +
geom_col()
```

```{r}
head(netflix_titles$date_added)
```

Date added

```{r}
netflix_titles %>%
filter(!is.na(date_added)) %>%
arrange(date_added) %>%
select(type, title, date_added)
netflix_titles %>%
filter(!is.na(date_added)) %>%
count(year_added, type) %>%
ggplot(aes(year_added, n, fill = type)) +
geom_area()
netflix_titles %>%
filter(year_added >= 2015) %>%
filter(!is.na(date_added), !is.na(rating)) %>%
mutate(rating = fct_lump(rating, 5)) %>%
ungroup() %>%
count(type, year_added, rating) %>%
group_by(type, year_added) %>%
mutate(percent = n / sum(n)) %>%
ggplot(aes(year_added, percent, fill = rating)) +
geom_area() +
facet_wrap(~ type)
```

```{r}
netflix_titles %>%
filter(!is.na(country)) %>%
count(country = fct_lump(country, 20),
type,
sort = TRUE) %>%
mutate(country = fct_reorder(country, n)) %>%
ggplot(aes(n, country, fill = type)) +
geom_col()
netflix_titles %>%
filter(!is.na(country)) %>%
filter(type == "Movie") %>%
group_by(country) %>%
summarize_titles()
```

```{r}
netflix_titles %>%
filter(rating %in% c("R", "TV-MA")) %>%
count(country, sort = TRUE)
netflix_titles %>%
filter(!is.na(rating), !is.na(country)) %>%
group_by(type, country = fct_lump(country, 9)) %>%
summarize(n_mature = sum(rating %in% c("R", "TV-MA", "NC-17")),
n = n(),
.groups = "drop") %>%
mutate(pct_mature = n_mature / n,
conf_low = qbeta(.025, n_mature + .5, n - n_mature + .5),
conf_high = qbeta(.975, n_mature + .5, n - n_mature + .5)) %>%
ggplot(aes(pct_mature, country, color = type)) +
geom_point(aes(size = n)) +
geom_errorbar(aes(xmin = conf_low, xmax = conf_high), width = .1) +
scale_x_continuous(labels = percent) +
expand_limits(x = 0) +
labs(x = "% of titles that are R/TV-MA")
```

```{r}
library(tidytext)
library(snakecase)
library(tidylo)
words_unnested <- netflix_titles %>%
unnest_tokens(word, description) %>%
anti_join(stop_words, by = "word")
words_unnested %>%
count(type, word, sort = TRUE) %>%
mutate(type = to_snake_case(type)) %>%
spread(type, n, fill = 0) %>%
mutate(total = movie + tv_show) %>%
arrange(desc(total)) %>%
head(100) %>%
ggplot(aes(movie, tv_show)) +
geom_point() +
geom_text(aes(label = word) , vjust = 1, hjust = 1) +
scale_x_log10() +
scale_y_log10()
words_unnested %>%
count(type, word) %>%
bind_log_odds(type, word, n) %>%
arrange(desc(log_odds_weighted)) %>%
group_by(type) %>%
top_n(10, log_odds_weighted) %>%
ungroup() %>%
mutate(word = fct_reorder(word, log_odds_weighted)) %>%
ggplot(aes(log_odds_weighted, word)) +
geom_col() +
facet_wrap(~ type, scales = "free_y")
```

```{r}
library(widyr)
library(tidygraph)
library(ggraph)
set.seed(2021)
words_unnested %>%
distinct(type, title, word) %>%
add_count(word, name = "word_total") %>%
filter(word_total >= 40) %>%
pairwise_cor(word, title, sort = TRUE) %>%
filter(correlation >= .1) %>%
igraph::graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(alpha = correlation)) +
geom_node_point() +
geom_node_text(aes(label = name),
repel = TRUE) +
theme(legend.position = "none")
```

```{r}
word_genre_log_odds <- words_unnested %>%
distinct(type, title, word, genre = listed_in) %>%
add_count(word, name = "word_total") %>%
filter(word_total >= 25) %>%
separate_rows(genre, sep = ", ") %>%
filter(fct_lump(genre, 9) != "Other") %>%
count(genre, word) %>%
bind_log_odds(genre, word, n)
```

```{r}
word_genre_log_odds %>%
group_by(genre) %>%
top_n(10, log_odds_weighted) %>%
ungroup() %>%
mutate(word = reorder_within(word, log_odds_weighted, genre)) %>%
ggplot(aes(log_odds_weighted, word, fill = genre)) +
geom_col() +
facet_wrap(~ genre, scales = "free_y") +
scale_y_reordered() +
theme(legend.position = "none") +
labs(x = "Log-odds of word's specificity to genre",
y = "")
```

## Lasso regression

```{r}
word_ratings <- words_unnested %>%
count(type, title, rating, word) %>%
filter(!is.na(rating)) %>%
add_count(word, name = "word_total") %>%
filter(word_total >= 30)
```

```{r}
library(glmnet)
library(broom)
other_features <- netflix_titles %>%
select(title, director, cast, genre = listed_in, country) %>%
gather(feature_type, feature, director, cast, genre, country) %>%
filter(!is.na(feature)) %>%
separate_rows(feature, sep = ", ") %>%
mutate(feature_type = str_to_title(feature_type)) %>%
unite(feature, feature_type, feature, sep = ": ") %>%
add_count(feature, name = "feature_count") %>%
filter(feature_count >= 10)
feature_matrix <- word_ratings %>%
mutate(feature = paste("Description:", word)) %>%
bind_rows(other_features) %>%
cast_sparse(title, feature)
y <- netflix_titles$mature[match(rownames(feature_matrix), netflix_titles$title)]
mod <- cv.glmnet(feature_matrix, y, family = "binomial")
plot(mod)
mod$glmnet.fit %>%
tidy() %>%
separate(term, c("feature_type", "feature"), sep = ": ") %>%
filter(lambda == mod$lambda.1se) %>%
top_n(60, abs(estimate)) %>%
mutate(feature = fct_reorder(feature, estimate)) %>%
ggplot(aes(estimate, feature, fill = feature_type)) +
geom_col() +
labs(x = "Coefficient: does this make the title more likely to be TV-MA/R?",
y = "",
fill = "Feature Type")
```

0 comments on commit 01947eb

Please sign in to comment.