# Global knitr chunk option: echo = TRUE shows each chunk's source code in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

Indicate Specific Conda Environment


Import Python Packages (will use “~/r-reticulate” as per call to use_condaenv)

import pandas as pd
import datetime as dt
  1. Create a Reddit app and get client ID and secret:

  2. Configure PRAW in R:

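  3. Store the Reddit app secrets in the R session as `my_client_id`, `my_client_secret`, and `my_user_agent` (the Python code below reads them through `r.`):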


Now, get started with PRAW:

import praw

# Authenticated PRAW client. The credentials are not hard-coded here: they are
# read from R variables (my_client_id, my_client_secret, my_user_agent) through
# the `r` object that is available inside Python chunks.
reddit = praw.Reddit(
    client_id=r.my_client_id,
    client_secret=r.my_client_secret,
    user_agent=r.my_user_agent,
)


Next, install PMAW to access the Pushshift API archive. From the command line, run: `pip3 install pmaw`.

from pmaw import PushshiftAPI
# Pushshift client from PMAW, constructed with the PRAW `reddit` client
# created above passed in as its `praw` argument.
api_praw = PushshiftAPI(praw=reddit)

Get Hot Posts


analytics_subreddit = reddit.subreddit('analytics')
## 129808
# Get the 10 current "hot" posts from the r/analytics subreddit and collect one
# row per post. The order of the fields appended here must match the `columns`
# list passed to pd.DataFrame below (eight fields, including the post id).
hot_posts = []
for post in analytics_subreddit.hot(limit=10):
    hot_posts.append([
        post.title,
        post.score,
        post.id,
        post.subreddit,
        post.url,
        post.num_comments,
        post.selftext,
        post.created,
    ])

# Turn the collected rows into a DataFrame; `selftext` is exposed as 'body'.
hot_posts = pd.DataFrame(
    hot_posts,
    columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'],
)
# Bring the `hot_posts` DataFrame built in the Python chunk into R through the `py` object.
hot_posts_df <- py$hot_posts
## Rows: 10
## Columns: 8
## $ title        <chr> "Monthly Career Advice and Job Openings", "2022 State of …
## $ score        <dbl> 12, 2, 10, 3, 2, 13, 6, 1, 35, 16
## $ id           <chr> "10g5ayb", "101vapj", "11264hg", "112htr8", "112hbxv", "1…
## $ subreddit    <list> analytics, analytics, analytics, analytics, analytics, an…
## $ url          <chr> "…
## $ num_comments <dbl> 47, 0, 22, 1, 1, 5, 1, 1, 44, 27
## $ body         <chr> "1. Have a question regarding interviewing, career advice…
## $ created      <dbl> 1674144014, 1672712257, 1676383423, 1676413065, 167641181…
## [1] "1. Have a question regarding interviewing, career advice, certifications?  Please include country, years of experience, vertical market, and size of business if applicable.\n2. Share your current marketing openings in the comments below. Include description, location (city/state), requirements, if it's on-site or remote, and salary.\n\nCheck out the community sidebar for other resources and our Discord link"

Get Posts

See for ideas:

# Search window for the submission query, expressed as integer Unix timestamps.
# NOTE(review): these are naive datetimes, so .timestamp() interprets them in
# the machine's local time zone -- confirm that is the intended boundary.
start_epoch = int(dt.datetime(2022, 1, 1).timestamp())
end_epoch = int(dt.datetime(2023, 2, 1).timestamp())

# Request up to 1000 r/analytics submissions inside the window. The optional
# free-text query is left disabled.
posts_praw = api_praw.search_submissions(
    # q="learning",
    subreddit='analytics',
    after=start_epoch,
    before=end_epoch,
    limit=1000,
)

# Materialise the results and load them into a DataFrame (one row per post).
post_list = list(posts_praw)
posts_df = pd.DataFrame(post_list)
# Pull the Python `posts_df` DataFrame into R, keep the columns of interest,
# give them analysis-friendly names, and derive the subreddit label and
# date/time fields.
posts_df_r <-
  py$posts_df %>%
  select(created_utc, subreddit, id, author, title, selftext,
         num_comments, score, ups, downs, upvote_ratio, permalink, url) %>%
  rename(post_id = id,
         post_date_time = created_utc,
         post_text = selftext) %>%
  mutate(
    # Rebuild `subreddit` from the permalink: drop the first "/" and then
    # everything from "/comments" onward (presumably leaving "r/<name>").
    subreddit = stringr::str_remove(permalink, "/"),
    subreddit = stringr::str_remove_all(subreddit, "/comments.*"),
    # Convert the creation timestamp to a date-time, then express it in
    # US Eastern time. The value is kept as a date-time throughout; the former
    # ymd_hms(as_datetime(...)) step re-parsed it through a character string,
    # which was redundant.
    post_date_time = anytime::anytime(post_date_time, asUTC = TRUE),
    post_date_time = lubridate::as_datetime(post_date_time),
    post_date_time = lubridate::with_tz(post_date_time, tzone = "US/Eastern"),
    # Namespaced so these never resolve to base::date(), which takes no
    # arguments, when lubridate is not attached.
    date = lubridate::date(post_date_time),
    year = lubridate::year(post_date_time)
  ) #%>%
  #distinct(post_id, .keep_all = TRUE)

# Quick sanity checks: the derived subreddit label of the first row and the
# earliest / latest post dates.
posts_df_r$subreddit[1]
min(posts_df_r$date)
max(posts_df_r$date)

# Save the cleaned posts to disk.
write_csv(posts_df_r, "./data/reddit-analytics-posts.csv")
# Reload the saved posts and classify each one by what is left of its text:
#   "[deleted]" -> "deleted", "[removed]" -> "removed", anything else -> "remaining".
# A missing post_text makes the comparisons NA, so a second pass maps NA to
# "remaining" as well.
posts <-
  read_csv("./data/reddit-analytics-posts.csv") %>%
  mutate(status = ifelse(post_text == "[deleted]",
                         "deleted",
                         ifelse(post_text == "[removed]",
                                "removed",
                                "remaining")),
         status = ifelse(is.na(status), "remaining", status),
         # `permalink` is a site-relative path, so prefix the Reddit host to
         # get the absolute thread URL that is used later to fetch comments.
         post_url = paste0("https://www.reddit.com", permalink))

# Number of posts in each status category.
posts %>% count(status)
## # A tibble: 3 × 2
##   status        n
##   <chr>     <int>
## 1 deleted      48
## 2 remaining   488
## 3 removed     263
# Keep only the posts whose status is "remaining".
filtered_posts <- filter(posts, status == "remaining")

# Report how many posts are left and the total of their num_comments values.
nrow(filtered_posts)
paste("Expected number of comments:", sum(filtered_posts$num_comments))
## [1] 488
## [1] "Expected number of comments: 3816"

# Save the filtered posts to disk.
write_csv(filtered_posts, "./data/reddit-analytics-posts-filtered.csv")

Get Comments

# Reload the filtered posts and list the distinct subreddit labels they cover.
posts2 <- read_csv("./data/reddit-analytics-posts-filtered.csv")
subreddit_vector <- unique(posts2[["subreddit"]])

#' Fetch the comments for every post of one subreddit.
#'
#' Looks up the thread URLs (`post_url`) of the rows in the global `posts2`
#' whose `subreddit` equals `x`, downloads the thread content with
#' RedditExtractoR, and returns its `comments` element.
#'
#' @param x A single subreddit label, as stored in `posts2$subreddit`.
#' @return The comments data frame with a `subreddit` column set to `x` and
#'   `comment_id` coerced to character, or `NULL` when no comments data frame
#'   was returned.
comments_with_subreddit <-
  function(x) {
    thread_content <-
      posts2 %>%
      filter(subreddit == x) %>%
      pull(post_url) %>%
      RedditExtractoR::get_thread_content()

    # The result is expected to be a list holding a `comments` data frame;
    # guard against anything else so one failed download does not abort the run.
    tmp_comments <-
      if (is.list(thread_content)) thread_content[["comments"]] else NULL

    if (is.data.frame(tmp_comments)) {
      tmp_comments <-
        tmp_comments %>%
        mutate(subreddit = x,
               # character ids keep the column type consistent when results
               # from several calls are row-bound later
               comment_id = as.character(comment_id))
    } else {
      tmp_comments <- NULL
    }

    tmp_comments
  }

# Fetch the comments for each subreddit. lapply() returns one list slot per
# subreddit (NULL where nothing came back) without growing a list in a loop.
comments_list <- lapply(subreddit_vector, comments_with_subreddit)

# Stack the per-subreddit comment tables into one; `column_label` identifies
# the list element each row came from.
all_comments <- comments_list %>% bind_rows(.id = "column_label")

# Drop comments whose text has been deleted or removed.
filtered_comments <- filter(
  all_comments,
  comment != "[deleted]" & comment != "[removed]"
)

# Row counts before and after filtering.
nrow(all_comments)
nrow(filtered_comments)

# Save both the full and the filtered comment sets.
write_csv(all_comments, "./data/reddit-analytics-comments.csv")
write_csv(filtered_comments, "./data/reddit-analytics-comments-filtered.csv")