library(tidyverse)
library(quanteda)
library(viridis)
# Read In Posts and Comments
# Load the filtered Reddit exports, keeping only the columns used below:
# the subreddit plus the text fields of each post / comment.
posts <- select(
  read_csv("./data/reddit-analytics-posts-filtered.csv"),
  subreddit, title, post_text
)

comments <- select(
  read_csv("./data/reddit-analytics-comments-filtered.csv"),
  subreddit, comment
)
# Build one long text blob per source (posts, comments, and both combined).
#
# Missing values are dropped before pasting: `paste()` would otherwise turn
# every NA title / body / comment into the literal word "NA" and inject it
# into the corpus.
corpus_posts <-
  posts %>%
  # `na.rm = TRUE` skips missing pieces, so a post without a body
  # contributes only its title (and vice versa).
  unite("text", title, post_text, sep = " ", na.rm = TRUE) %>%
  pull(text) %>%
  paste(collapse = " ")

corpus_comments <-
  comments %>%
  filter(!is.na(comment)) %>%
  pull(comment) %>%
  paste(collapse = " ")

# Both inputs are single strings here, so a plain `paste()` (default
# separator: one space) is all that is needed to join them.
corpus_all <- paste(corpus_posts, corpus_comments)
# Domain-specific filler words removed in addition to the standard English
# stopword list.
my_extra_stopwords <-
  c("NA", "just", "also", "can", "like", "etc", "lot", "many", "much", "even",
    "sure")

# Turn raw text into a cleaned document-feature matrix.
#
# text            Character vector; each element becomes one document.
# extra_stopwords Additional terms to drop on top of
#                 quanteda::stopwords("english").
#
# Returns a quanteda dfm with lower-cased features, built from tokens with
# separators, punctuation, symbols, numbers and URLs removed and with tokens
# shorter than 3 characters dropped.
build_dfm <- function(text, extra_stopwords = character()) {
  text %>%
    quanteda::corpus() %>%
    quanteda::tokens(
      remove_separators = TRUE,
      remove_punct = TRUE,
      remove_symbols = TRUE,
      remove_numbers = TRUE,
      remove_url = TRUE
    ) %>%
    # Filter: keep tokens with at least 3 characters.
    quanteda::tokens_select(min_nchar = 3L) %>%
    quanteda::dfm(tolower = TRUE) %>%
    quanteda::dfm_remove(
      c(extra_stopwords, quanteda::stopwords("english"))
    )
}

# Posts and comments go through the exact same cleaning pipeline so that
# their term counts are comparable.
dfm_posts <- build_dfm(corpus_posts, my_extra_stopwords)
dfm_comments <- build_dfm(corpus_comments, my_extra_stopwords)
# Term Frequency and Document Frequency
# Named vector (term -> count) of the 100 most frequent features in the
# posts DFM.
top_words_vector_posts <-
  quanteda::topfeatures(dfm_posts, n = 100, scheme = "count")

# One row per top term: the term, the total number of posts, the term's
# count, and that count expressed as a percentage of the number of posts.
#
# NOTE(review): `count_in_posts` is a raw term count (scheme = "count") over
# a corpus that was collapsed into a single string, not the number of posts
# containing the term. The ratio can therefore exceed 100, which is why it is
# capped at 100 below.
top_words_posts <-
  tibble(
    term = names(top_words_vector_posts),
    n_posts = nrow(posts),
    count_in_posts = top_words_vector_posts
  ) %>%
  mutate(
    p_posts = pmin(round(100 * count_in_posts / n_posts, 2), 100)
  )

head(top_words_posts, 10)
## # A tibble: 10 × 4
## term n_posts count_in_posts p_posts
## <chr> <int> <dbl> <dbl>
## 1 data 488 656 100
## 2 analytics 488 272 55.7
## 3 sql 488 184 37.7
## 4 google 488 176 36.1
## 5 work 488 176 36.1
## 6 analyst 488 168 34.4
## 7 get 488 160 32.8
## 8 new 488 152 31.2
## 9 role 488 152 31.2
## 10 one 488 152 31.2
# Named vector (term -> count) of the 100 most frequent features in the
# comments DFM.
top_words_vector_comments <-
  quanteda::topfeatures(dfm_comments, n = 100, scheme = "count")

# One row per top term: the term, the total number of comments, the term's
# count, and that count expressed as a percentage of the number of comments.
#
# NOTE(review): `count_in_comments` is a raw term count (scheme = "count")
# over a corpus that was collapsed into a single string, not the number of
# comments containing the term; the percentage is capped at 100 for that
# reason.
top_words_comments <-
  tibble(
    term = names(top_words_vector_comments),
    n_comments = nrow(comments),
    count_in_comments = top_words_vector_comments
  ) %>%
  mutate(
    p_comments = pmin(round(100 * count_in_comments / n_comments, 2), 100)
  )

head(top_words_comments, 10)
## # A tibble: 10 × 4
## term n_comments count_in_comments p_comments
## <chr> <int> <dbl> <dbl>
## 1 data 3568 2000 56.0
## 2 please 3568 976 27.4
## 3 analytics 3568 848 23.8
## 4 questions 3568 616 17.3
## 5 report 3568 568 15.9
## 6 use 3568 552 15.5
## 7 work 3568 552 15.5
## 8 get 3568 544 15.2
## 9 message 3568 528 14.8
## 10 post 3568 520 14.6
# Combine the two top-100 tables and compare how strongly each term is
# associated with posts versus comments.
#
# Columns added:
#   odds_posts / odds_comments  count / (n - count), written as a ratio of
#                               proportions; Inf when count >= n.
#   log_odds_ratio              log(odds_posts / odds_comments); -Inf / Inf
#                               when the term is missing from one side.
#
# NOTE(review): a term that is absent from one top-100 list is treated as
# having a count of 0 on that side, even though it may simply rank below the
# cutoff there.
top_words <-
  top_words_posts %>%
  full_join(top_words_comments, join_by(term)) %>%
  mutate(
    # The totals are constants of the data set. After the full join they are
    # NA for terms that appear in only one list, so restore them instead of
    # zero-filling (a zero denominator would make the odds 0 / 0).
    n_posts = nrow(posts),
    n_comments = nrow(comments),
    # Only counts and percentages are genuinely "zero" for a missing term.
    across(
      c(count_in_posts, p_posts, count_in_comments, p_comments),
      \(x) replace_na(x, 0)
    ),
    odds_posts = (count_in_posts / n_posts) /
      ((n_posts - count_in_posts) / n_posts),
    # Counts are term counts and can exceed n, giving negative odds; treat
    # those as "always present".
    odds_posts = ifelse(odds_posts < 0, Inf, odds_posts),
    odds_comments = (count_in_comments / n_comments) /
      ((n_comments - count_in_comments) / n_comments),
    odds_comments = ifelse(odds_comments < 0, Inf, odds_comments),
    # Guards the degenerate case of an empty input table (n == 0).
    across(odds_posts:odds_comments, \(x) replace_na(x, 0)),
    log_odds_ratio = log(odds_posts / odds_comments)
  ) %>%
  arrange(odds_posts)

head(top_words)
## # A tibble: 6 × 10
## term n_posts count…¹ p_posts n_com…² count…³ p_com…⁴ odds_…⁵ odds_…⁶ log_o…⁷
## <chr> <int> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 please 488 0 0 3568 976 27.4 0 0.377 -Inf
## 2 quest… 488 0 0 3568 616 17.3 0 0.209 -Inf
## 3 messa… 488 0 0 3568 528 14.8 0 0.174 -Inf
## 4 post 488 0 0 3568 520 14.6 0 0.171 -Inf
## 5 bot 488 0 0 3568 512 14.4 0 0.168 -Inf
## 6 follow 488 0 0 3568 504 14.1 0 0.164 -Inf
## # … with abbreviated variable names ¹count_in_posts, ²n_comments,
## # ³count_in_comments, ⁴p_comments, ⁵odds_posts, ⁶odds_comments,
## # ⁷log_odds_ratio
# Plot-ready table: one row per term with its share in posts and comments
# and the magnitude (sign dropped) of the log odds ratio, ordered from the
# highest to the lowest share in posts.
comparison_plot <-
  top_words %>%
  mutate(log_odds_ratio = abs(log_odds_ratio)) %>%
  select(term, p_posts, p_comments, log_odds_ratio) %>%
  arrange(desc(p_posts))

head(comparison_plot)
## # A tibble: 6 × 4
## term p_posts p_comments log_odds_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 data 100 56.0 Inf
## 2 analytics 55.7 23.8 1.40
## 3 sql 37.7 13.4 1.36
## 4 google 36.1 6.73 2.06
## 5 work 36.1 15.5 1.13
## 6 analyst 34.4 6.05 2.10
# Visualization