library(tidyverse)
library(quanteda)

Read In Posts and Comments

# Keep only the columns needed downstream
posts <- 
  read_csv("./data/reddit-analytics-posts-filtered.csv") %>%
  select(subreddit, title, post_text)

comments <- 
  read_csv("./data/reddit-analytics-comments-filtered.csv") %>%
  select(subreddit, comment)
corpus_posts <-
  posts %>%
  # paste() coerces missing post_text to the literal string "NA";
  # that token is removed later via my_extra_stopwords
  mutate(text = paste(title, post_text)) %>%
  pull(text) %>%
  paste(collapse = ' ')

corpus_comments <-
  comments %>%
  pull(comment) %>%
  paste(collapse = ' ')

# sep (not collapse) is what joins two length-1 strings; collapse has no effect here
corpus_all <- paste(corpus_posts, corpus_comments, sep = ' ')
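
As a quick sanity check (a sketch, not part of the original pipeline), the input sizes and the length of the combined corpus can be inspected with standard tidyverse/stringr calls:

nrow(posts)               # number of posts read in
nrow(comments)            # number of comments read in
str_length(corpus_all)    # total characters in the combined corpus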
# Corpus-specific terms to drop on top of the standard English stopword list;
# "NA" is included because paste() turned missing post_text values into that string
my_extra_stopwords <-
  c("NA", "just", "also", "can", "like", "etc", "lot", "many", "much", "even", "sure")

dfm_all <-
  corpus_all %>%
  quanteda::corpus() %>%
  quanteda::tokens(
    remove_separators = TRUE,
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE) %>%
  quanteda::tokens_select(min_nchar = 3L) %>% # keep only tokens of 3+ characters
  quanteda::dfm(tolower = TRUE) %>%
  quanteda::dfm_remove(c(my_extra_stopwords,
                         quanteda::stopwords("english")))
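
Before counting terms, it is worth confirming the shape of the resulting dfm; since the whole corpus was collapsed into one string, ndoc() should return 1. A small sketch using standard quanteda accessors:

quanteda::ndoc(dfm_all)   # expect 1: the whole corpus is a single document
quanteda::nfeat(dfm_all)  # distinct terms remaining after filtering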

Term Frequency and Document Frequency Based on the Whole Corpus

top_words_vector <- 
  dfm_all %>%
  quanteda::topfeatures(scheme = "count", n = 100) # 100 most frequent terms by raw count

top_words <-
  tibble(term = names(top_words_vector), 
         count = top_words_vector)
head(top_words, 10)
## # A tibble: 10 × 2
##    term      count
##    <chr>     <dbl>
##  1 data       2656
##  2 analytics  1120
##  3 please      984
##  4 work        728
##  5 get         704
##  6 use         688
##  7 sql         664
##  8 questions   632
##  9 report      616
## 10 one         608
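
Note that because corpus_all collapses everything into a single document, document frequency is uninformative above: quanteda::docfreq() would return 1 for every term. A minimal sketch of meaningful document frequencies, assuming the same cleaning steps but treating each post and comment as its own document (dfm_docs is a name introduced here for illustration):

dfm_docs <-
  c(posts %>% mutate(text = paste(title, post_text)) %>% pull(text),
    comments %>% filter(!is.na(comment)) %>% pull(comment)) %>%
  quanteda::corpus() %>%
  quanteda::tokens(
    remove_separators = TRUE,
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE) %>%
  quanteda::tokens_select(min_nchar = 3L) %>%
  quanteda::dfm(tolower = TRUE) %>%
  quanteda::dfm_remove(c(my_extra_stopwords,
                         quanteda::stopwords("english")))

# Terms ranked by the number of posts/comments they appear in
quanteda::topfeatures(dfm_docs, scheme = "docfreq", n = 10)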