library(tidyverse)
library(quanteda)
library(viridis)

Read In Posts and Comments

# Load the filtered Reddit posts, keeping only the columns used below.
posts <-
  read_csv("./data/reddit-analytics-posts-filtered.csv") %>%
  select(subreddit, title, post_text)

# Load the filtered Reddit comments, keeping only the columns used below.
comments <-
  read_csv("./data/reddit-analytics-comments-filtered.csv") %>%
  select(subreddit, comment)

# All posts (title followed by body) collapsed into a single string.
# unite(na.rm = TRUE) skips a missing title or body; paste() would instead
# write the literal text "NA" into the corpus for every missing value.
corpus_posts <-
  posts %>%
  unite("text", title, post_text, sep = " ", na.rm = TRUE) %>%
  pull(text) %>%
  paste(collapse = " ")

# All comments collapsed into a single string; missing comments are dropped
# for the same reason as above.
corpus_comments <-
  comments %>%
  filter(!is.na(comment)) %>%
  pull(comment) %>%
  paste(collapse = " ")

# Posts and comments together. Both inputs are single strings, so the default
# " " separator already yields one string and no `collapse` is required.
corpus_all <- paste(corpus_posts, corpus_comments)
# Extra stopwords removed in addition to quanteda's English stopword list.
my_extra_stopwords <-
  c("NA", "just", "also", "can", "like", "etc", "lot", "many", "much", "even", "sure")

# Turn raw text into a quanteda document-feature matrix.
#
# Steps: tokenise (dropping separators, punctuation, symbols, numbers and
# URLs), keep only tokens of at least 3 characters, build a lower-cased dfm,
# then remove the English stopwords plus `extra_stopwords`.
#
# text:            character vector of documents.
# extra_stopwords: character vector of additional features to remove.
# Returns a quanteda dfm.
build_dfm <- function(text, extra_stopwords = my_extra_stopwords) {
  text %>%
    quanteda::corpus() %>%
    quanteda::tokens(
      remove_separators = TRUE,
      remove_punct = TRUE,
      remove_symbols = TRUE,
      remove_numbers = TRUE,
      remove_url = TRUE
    ) %>%
    quanteda::tokens_select(min_nchar = 3L) %>% # Filter: at least 3 characters
    quanteda::dfm(tolower = TRUE) %>%
    quanteda::dfm_remove(c(extra_stopwords, quanteda::stopwords("english")))
}

# Same preprocessing for both sources so their term counts are comparable.
dfm_posts <- build_dfm(corpus_posts)

dfm_comments <- build_dfm(corpus_comments)

Term Frequency and Document Frequency

# Named vector: the 100 highest-count features of the posts dfm.
top_words_vector_posts <-
  quanteda::topfeatures(dfm_posts, n = 100, scheme = "count")

# One row per top term: its count and that count expressed as a percentage
# of the number of posts, capped at 100 (a count can exceed the post total).
top_words_posts <-
  tibble(
    term = names(top_words_vector_posts),
    n_posts = nrow(posts),
    count_in_posts = top_words_vector_posts
  ) %>%
  mutate(
    p_posts = round(100 * count_in_posts / n_posts, 2),
    p_posts = ifelse(p_posts > 100, 100, p_posts)
  )

top_words_posts %>% head(10)
## # A tibble: 10 × 4
##    term      n_posts count_in_posts p_posts
##    <chr>       <int>          <dbl>   <dbl>
##  1 data          488            656   100  
##  2 analytics     488            272    55.7
##  3 sql           488            184    37.7
##  4 google        488            176    36.1
##  5 work          488            176    36.1
##  6 analyst       488            168    34.4
##  7 get           488            160    32.8
##  8 new           488            152    31.2
##  9 role          488            152    31.2
## 10 one           488            152    31.2
# Named vector: the 100 highest-count features of the comments dfm.
top_words_vector_comments <-
  quanteda::topfeatures(dfm_comments, n = 100, scheme = "count")

# One row per top term: its count and that count expressed as a percentage
# of the number of comments, capped at 100.
top_words_comments <-
  tibble(
    term = names(top_words_vector_comments),
    n_comments = nrow(comments),
    count_in_comments = top_words_vector_comments
  ) %>%
  mutate(
    p_comments = round(100 * count_in_comments / n_comments, 2),
    p_comments = ifelse(p_comments > 100, 100, p_comments)
  )

top_words_comments %>% head(10)
## # A tibble: 10 × 4
##    term      n_comments count_in_comments p_comments
##    <chr>          <int>             <dbl>      <dbl>
##  1 data            3568              2000       56.0
##  2 please          3568               976       27.4
##  3 analytics       3568               848       23.8
##  4 questions       3568               616       17.3
##  5 report          3568               568       15.9
##  6 use             3568               552       15.5
##  7 work            3568               552       15.5
##  8 get             3568               544       15.2
##  9 message         3568               528       14.8
## 10 post            3568               520       14.6
# Combine the top terms of posts and comments and compare them via odds.
#
# NOTE: a term that is missing from one side's top-100 list is treated as
# having count 0 on that side, even though it may occur there less often.
top_words <-
  top_words_posts %>%
  full_join(top_words_comments, join_by(term)) %>%
  mutate(
    # The totals are constants. Terms present in only one list come out of the
    # join with NA totals; restore the real values rather than zero-filling
    # them, which would store a wrong total and make the odds below 0 / 0.
    n_posts = nrow(posts),
    n_comments = nrow(comments),
    # Only the per-term measures default to 0 when a term is absent.
    across(
      c(count_in_posts, p_posts, count_in_comments, p_comments),
      \(x) replace_na(x, 0)
    ),
    odds_posts = (count_in_posts / n_posts) /
      ((n_posts - count_in_posts) / n_posts),
    # A count larger than the total makes the denominator negative; treat
    # that case as infinite odds.
    odds_posts = ifelse(odds_posts < 0, Inf, odds_posts),
    odds_comments = (count_in_comments / n_comments) /
      ((n_comments - count_in_comments) / n_comments),
    odds_comments = ifelse(odds_comments < 0, Inf, odds_comments),
    # Guard against NaN/NA odds (e.g. an empty input with a total of 0).
    across(odds_posts:odds_comments, \(x) replace_na(x, 0)),
    log_odds_ratio = log(odds_posts / odds_comments)
  ) %>%
  arrange(odds_posts)

head(top_words)
## # A tibble: 6 × 10
##   term   n_posts count…¹ p_posts n_com…² count…³ p_com…⁴ odds_…⁵ odds_…⁶ log_o…⁷
##   <chr>    <int>   <dbl>   <dbl>   <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 please       0       0       0    3568     976    27.4       0   0.377    -Inf
## 2 quest…       0       0       0    3568     616    17.3       0   0.209    -Inf
## 3 messa…       0       0       0    3568     528    14.8       0   0.174    -Inf
## 4 post         0       0       0    3568     520    14.6       0   0.171    -Inf
## 5 bot          0       0       0    3568     512    14.4       0   0.168    -Inf
## 6 follow       0       0       0    3568     504    14.1       0   0.164    -Inf
## # … with abbreviated variable names ¹​count_in_posts, ²​n_comments,
## #   ³​count_in_comments, ⁴​p_comments, ⁵​odds_posts, ⁶​odds_comments,
## #   ⁷​log_odds_ratio
# Plot input: percentage columns plus the magnitude of the log odds ratio,
# ordered from the most to the least frequent term in posts.
comparison_plot <-
  top_words %>%
  mutate(log_odds_ratio = abs(log_odds_ratio)) %>%
  select(term, p_posts, p_comments, log_odds_ratio) %>%
  arrange(desc(p_posts))

comparison_plot %>% head()
## # A tibble: 6 × 4
##   term      p_posts p_comments log_odds_ratio
##   <chr>       <dbl>      <dbl>          <dbl>
## 1 data        100        56.0          Inf   
## 2 analytics    55.7      23.8            1.40
## 3 sql          37.7      13.4            1.36
## 4 google       36.1       6.73           2.06
## 5 work         36.1      15.5            1.13
## 6 analyst      34.4       6.05           2.10

Visualization