library(tidyverse)
library(quanteda)
Read In Posts and Comments
posts <-
  read_csv("./data/reddit-analytics-posts-filtered.csv") %>%
  select(subreddit, title, post_text)

comments <-
  read_csv("./data/reddit-analytics-comments-filtered.csv") %>%
  select(subreddit, comment)
corpus_posts <-
  posts %>%
  mutate(text = paste(title, post_text)) %>% # combine title and body text
  pull(text) %>%
  paste(collapse = ' ') # collapse all posts into a single string

corpus_comments <-
  comments %>%
  pull(comment) %>%
  paste(collapse = ' ') # collapse all comments into a single string

corpus_all <- paste(corpus_posts, corpus_comments, sep = ' ')
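Collapsing everything into one long string means the corpus below contains a single document, so per-document statistics such as document frequency become degenerate. A minimal sketch of a variant that keeps each post and comment as its own document instead (docs_all and corpus_docs are hypothetical names, assuming the same columns as above):
# Sketch (not the original pipeline): keep each post and comment as a
# separate document so document frequency is meaningful.
docs_all <-
  bind_rows(
    posts %>% transmute(text = paste(title, post_text)),
    comments %>% transmute(text = comment)
  )
corpus_docs <- quanteda::corpus(docs_all, text_field = "text")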
my_extra_stopwords <-
  c("NA", "just", "also", "can", "like", "etc", "lot", "many", "much", "even", "sure")
dfm_all <-
  corpus_all %>%
  quanteda::corpus() %>%
  quanteda::tokens(
    remove_separators = TRUE,
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE) %>%
  tokens_select(min_nchar = 3L) %>% # keep only tokens of at least 3 characters
  quanteda::dfm(tolower = TRUE) %>%
  quanteda::dfm_remove(c(my_extra_stopwords,
                         quanteda::stopwords("english")))
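topfeatures() below reports raw counts only. For a table that also includes document frequency, quanteda's companion package quanteda.textstats provides textstat_frequency(); a sketch, assuming that package is installed (and noting that with the single collapsed document above, docfreq would be 1 for every term):
# Sketch: ranked term and document frequencies via quanteda.textstats.
library(quanteda.textstats)
freq_table <- textstat_frequency(dfm_all, n = 100)
head(freq_table)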
Term Frequency and Document Frequency Based on the Whole Corpus
top_words_vector <-
  dfm_all %>%
  quanteda::topfeatures(scheme = "count", n = 100)

top_words <-
  tibble(term = names(top_words_vector),
         count = top_words_vector)
head(top_words, 10)
## # A tibble: 10 × 2
##    term      count
##    <chr>     <dbl>
##  1 data       2656
##  2 analytics  1120
##  3 please      984
##  4 work        728
##  5 get         704
##  6 use         688
##  7 sql         664
##  8 questions   632
##  9 report      616
## 10 one         608
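Since ggplot2 is loaded with the tidyverse, a quick bar chart makes these counts easier to compare; a sketch, not part of the original output:
# Sketch: plot the ten most frequent terms, largest at the top.
top_words %>%
  slice_max(count, n = 10) %>%
  ggplot(aes(x = reorder(term, count), y = count)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Count")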