-
Notifications
You must be signed in to change notification settings - Fork 145
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
283 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,283 @@ | ||
--- | ||
title: "Netflix Titles" | ||
date: 2021-04-20 | ||
output: html_output | ||
--- | ||
|
||
```{r setup, include=FALSE} | ||
knitr::opts_chunk$set(echo = TRUE) | ||
library(tidyverse) | ||
library(tidytuesdayR) | ||
library(scales) | ||
theme_set(theme_light()) | ||
``` | ||
|
||
# Load the weekly Data | ||
|
||
Dowload the weekly data and make available in the `tt` object. | ||
|
||
```{r Load} | ||
library(lubridate) | ||
tt <- tt_load("2021-04-20") | ||
netflix_titles <- tt$netflix_titles %>% | ||
separate(duration, c("duration", "duration_units"), sep = " ", convert = TRUE) %>% | ||
mutate(date_added = mdy(date_added), | ||
year_added = year(date_added)) %>% | ||
mutate(mature = rating %in% c("TV-MA", "R", "NC-17")) | ||
``` | ||
|
||
```{r} | ||
netflix_titles %>% | ||
ggplot(aes(release_year, fill = type)) + | ||
geom_histogram(binwidth = 5) + | ||
facet_wrap(~ type, ncol = 1, scales = "free_y") | ||
netflix_titles %>% | ||
#count(year = 2 * (release_year %/% 2), type) %>% | ||
count(year = release_year, type) %>% | ||
group_by(type) %>% | ||
mutate(percent = n / sum(n)) %>% | ||
ggplot(aes(year, percent, color = type)) + | ||
geom_line() | ||
``` | ||
|
||
```{r} | ||
netflix_titles %>% | ||
filter(type == "Movie") %>% | ||
mutate(decade = 10 * (release_year %/% 10)) %>% | ||
ggplot(aes(decade, duration, group = decade)) + | ||
geom_boxplot() | ||
``` | ||
|
||
```{r} | ||
summarize_titles <- function(tbl) { | ||
tbl %>% | ||
summarize(n = n(), | ||
median_duration = median(duration), | ||
median_year = median(release_year)) %>% | ||
arrange(desc(n)) | ||
} | ||
``` | ||
|
||
|
||
```{r} | ||
netflix_titles %>% | ||
count(rating, sort = TRUE) | ||
netflix_titles %>% | ||
separate_rows(listed_in, sep = ", ") %>% | ||
group_by(type, genre = listed_in) %>% | ||
summarize_titles() %>% | ||
filter(type == "Movie") %>% | ||
filter(genre != "Movies") %>% | ||
mutate(genre = fct_reorder(genre, median_duration)) %>% | ||
ggplot(aes(median_duration, genre)) + | ||
geom_col() | ||
``` | ||
|
||
```{r} | ||
head(netflix_titles$date_added) | ||
``` | ||
|
||
Date added | ||
|
||
```{r} | ||
netflix_titles %>% | ||
filter(!is.na(date_added)) %>% | ||
arrange(date_added) %>% | ||
select(type, title, date_added) | ||
netflix_titles %>% | ||
filter(!is.na(date_added)) %>% | ||
count(year_added, type) %>% | ||
ggplot(aes(year_added, n, fill = type)) + | ||
geom_area() | ||
netflix_titles %>% | ||
filter(year_added >= 2015) %>% | ||
filter(!is.na(date_added), !is.na(rating)) %>% | ||
mutate(rating = fct_lump(rating, 5)) %>% | ||
ungroup() %>% | ||
count(type, year_added, rating) %>% | ||
group_by(type, year_added) %>% | ||
mutate(percent = n / sum(n)) %>% | ||
ggplot(aes(year_added, percent, fill = rating)) + | ||
geom_area() + | ||
facet_wrap(~ type) | ||
``` | ||
|
||
```{r} | ||
netflix_titles %>% | ||
filter(!is.na(country)) %>% | ||
count(country = fct_lump(country, 20), | ||
type, | ||
sort = TRUE) %>% | ||
mutate(country = fct_reorder(country, n)) %>% | ||
ggplot(aes(n, country, fill = type)) + | ||
geom_col() | ||
netflix_titles %>% | ||
filter(!is.na(country)) %>% | ||
filter(type == "Movie") %>% | ||
group_by(country) %>% | ||
summarize_titles() | ||
``` | ||
|
||
```{r} | ||
netflix_titles %>% | ||
filter(rating %in% c("R", "TV-MA")) %>% | ||
count(country, sort = TRUE) | ||
netflix_titles %>% | ||
filter(!is.na(rating), !is.na(country)) %>% | ||
group_by(type, country = fct_lump(country, 9)) %>% | ||
summarize(n_mature = sum(rating %in% c("R", "TV-MA", "NC-17")), | ||
n = n(), | ||
.groups = "drop") %>% | ||
mutate(pct_mature = n_mature / n, | ||
conf_low = qbeta(.025, n_mature + .5, n - n_mature + .5), | ||
conf_high = qbeta(.975, n_mature + .5, n - n_mature + .5)) %>% | ||
ggplot(aes(pct_mature, country, color = type)) + | ||
geom_point(aes(size = n)) + | ||
geom_errorbar(aes(xmin = conf_low, xmax = conf_high), width = .1) + | ||
scale_x_continuous(labels = percent) + | ||
expand_limits(x = 0) + | ||
labs(x = "% of titles that are R/TV-MA") | ||
``` | ||
|
||
```{r} | ||
library(tidytext) | ||
library(snakecase) | ||
library(tidylo) | ||
words_unnested <- netflix_titles %>% | ||
unnest_tokens(word, description) %>% | ||
anti_join(stop_words, by = "word") | ||
words_unnested %>% | ||
count(type, word, sort = TRUE) %>% | ||
mutate(type = to_snake_case(type)) %>% | ||
spread(type, n, fill = 0) %>% | ||
mutate(total = movie + tv_show) %>% | ||
arrange(desc(total)) %>% | ||
head(100) %>% | ||
ggplot(aes(movie, tv_show)) + | ||
geom_point() + | ||
geom_text(aes(label = word) , vjust = 1, hjust = 1) + | ||
scale_x_log10() + | ||
scale_y_log10() | ||
words_unnested %>% | ||
count(type, word) %>% | ||
bind_log_odds(type, word, n) %>% | ||
arrange(desc(log_odds_weighted)) %>% | ||
group_by(type) %>% | ||
top_n(10, log_odds_weighted) %>% | ||
ungroup() %>% | ||
mutate(word = fct_reorder(word, log_odds_weighted)) %>% | ||
ggplot(aes(log_odds_weighted, word)) + | ||
geom_col() + | ||
facet_wrap(~ type, scales = "free_y") | ||
``` | ||
|
||
```{r} | ||
library(widyr) | ||
library(tidygraph) | ||
library(ggraph) | ||
set.seed(2021) | ||
words_unnested %>% | ||
distinct(type, title, word) %>% | ||
add_count(word, name = "word_total") %>% | ||
filter(word_total >= 40) %>% | ||
pairwise_cor(word, title, sort = TRUE) %>% | ||
filter(correlation >= .1) %>% | ||
igraph::graph_from_data_frame() %>% | ||
ggraph(layout = "fr") + | ||
geom_edge_link(aes(alpha = correlation)) + | ||
geom_node_point() + | ||
geom_node_text(aes(label = name), | ||
repel = TRUE) + | ||
theme(legend.position = "none") | ||
``` | ||
|
||
```{r} | ||
word_genre_log_odds <- words_unnested %>% | ||
distinct(type, title, word, genre = listed_in) %>% | ||
add_count(word, name = "word_total") %>% | ||
filter(word_total >= 25) %>% | ||
separate_rows(genre, sep = ", ") %>% | ||
filter(fct_lump(genre, 9) != "Other") %>% | ||
count(genre, word) %>% | ||
bind_log_odds(genre, word, n) | ||
``` | ||
|
||
```{r} | ||
word_genre_log_odds %>% | ||
group_by(genre) %>% | ||
top_n(10, log_odds_weighted) %>% | ||
ungroup() %>% | ||
mutate(word = reorder_within(word, log_odds_weighted, genre)) %>% | ||
ggplot(aes(log_odds_weighted, word, fill = genre)) + | ||
geom_col() + | ||
facet_wrap(~ genre, scales = "free_y") + | ||
scale_y_reordered() + | ||
theme(legend.position = "none") + | ||
labs(x = "Log-odds of word's specificity to genre", | ||
y = "") | ||
``` | ||
|
||
## Lasso regression | ||
|
||
```{r} | ||
word_ratings <- words_unnested %>% | ||
count(type, title, rating, word) %>% | ||
filter(!is.na(rating)) %>% | ||
add_count(word, name = "word_total") %>% | ||
filter(word_total >= 30) | ||
``` | ||
|
||
```{r} | ||
library(glmnet) | ||
library(broom) | ||
other_features <- netflix_titles %>% | ||
select(title, director, cast, genre = listed_in, country) %>% | ||
gather(feature_type, feature, director, cast, genre, country) %>% | ||
filter(!is.na(feature)) %>% | ||
separate_rows(feature, sep = ", ") %>% | ||
mutate(feature_type = str_to_title(feature_type)) %>% | ||
unite(feature, feature_type, feature, sep = ": ") %>% | ||
add_count(feature, name = "feature_count") %>% | ||
filter(feature_count >= 10) | ||
feature_matrix <- word_ratings %>% | ||
mutate(feature = paste("Description:", word)) %>% | ||
bind_rows(other_features) %>% | ||
cast_sparse(title, feature) | ||
y <- netflix_titles$mature[match(rownames(feature_matrix), netflix_titles$title)] | ||
mod <- cv.glmnet(feature_matrix, y, family = "binomial") | ||
plot(mod) | ||
mod$glmnet.fit %>% | ||
tidy() %>% | ||
separate(term, c("feature_type", "feature"), sep = ": ") %>% | ||
filter(lambda == mod$lambda.1se) %>% | ||
top_n(60, abs(estimate)) %>% | ||
mutate(feature = fct_reorder(feature, estimate)) %>% | ||
ggplot(aes(estimate, feature, fill = feature_type)) + | ||
geom_col() + | ||
labs(x = "Coefficient: does this make the title more likely to be TV-MA/R?", | ||
y = "", | ||
fill = "Feature Type") | ||
``` |