knitr::opts_chunk$set(echo = TRUE)
library(reticulate)
library(tidyverse)
library(RedditExtractoR)
library(anytime)
library(lubridate)
library(beepr)

Indicate Specific Conda Environment

reticulate::use_condaenv("~/r-reticulate")

Import Python Packages (these will use the "~/r-reticulate" environment, as set by the call to use_condaenv above)

import pandas as pd
import datetime as dt
  1. Create a Reddit app and get client ID and secret: https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps

  2. Configure PRAW in R: https://praw.readthedocs.io/en/stable/getting_started/configuration.html#configuration

  3. Store Reddit app secrets securely (e.g., as environment variables in your .Renviron file) rather than hard-coding them in this document

PRAW

Now, get started with PRAW:

import praw

# Authenticate against the Reddit API with the app credentials stored on the
# R side; reticulate exposes R variables to Python through the `r` object.
# NOTE(review): assumes my_client_id / my_client_secret / my_user_agent were
# defined in an earlier (unshown) R chunk — confirm before knitting.
reddit = praw.Reddit(
  client_id = r.my_client_id, 
  client_secret = r.my_client_secret, 
  user_agent = r.my_user_agent
  )

PMAW

Next, we will need to install PMAW to access the Pushshift API archive. From the command line, run: pip3 install pmaw

from pmaw import PushshiftAPI
# Wrap the authenticated praw client so PMAW can enrich Pushshift archive
# results with live Reddit metadata.
api_praw = PushshiftAPI(praw = reddit)

Get Hot Posts

See https://towardsdatascience.com/scraping-reddit-data-1c0af3040768

# Open a handle to r/analytics and print its sidebar description.
analytics_subreddit = reddit.subreddit('analytics')
print(analytics_subreddit.description)
## Dedicated to web analytics, data and business analytics. We're here to discuss analysis of data, learning of skills and implementation of web analytics. 
## 
## Discussions or questions on Google Analytics, statistics, R, earning qualifications, SQL and anything data related are encouraged. 
## 
## You need at least **5 comment karma** to post here. As in from the whole of reddit. If you don't have that, your posts will need to be manually approved so PM the mods after you've posted your thread. 
## 
## **[Join the community on Discord](https://discord.com/invite/q6ePcxeQja)**
## 
## - [Marketing Jobs](https://lookingformarketing.com/jobs?utm_source=r_analytics&utm_medium=old_sidebar)
## - [Marketing Books](https://lookingformarketing.com/books?utm_source=r_analytics&utm_medium=old_sidebar)
## - [Marketing Podcasts](https://lookingformarketing.com/podcasts?utm_source=r_analytics&utm_medium=old_sidebar)
## - [Marketing Courses](https://lookingformarketing.com/courses?utm_source=r_analytics&utm_medium=old_sidebar)
## - [Marketing Events](https://lookingformarketing.com/events?utm_source=r_analytics&utm_medium=old_sidebar)
## - [Marketing Definitions](https://lookingformarketing.com/definitions?utm_source=r_analytics&utm_medium=old_sidebar)
## 
## 
## **Reddit Marketing**  
## - r/marketing  
## - r/socialmedia  
## - r/advertising  
## - r/digital_marketing  
## - r/analytics  
## - r/DigitalMarketing  
## - r/webmarketing  
## - r/AskMarketing  
## - r/content_marketing  
## - r/SocialMediaMarketing  
## - r/GoogleAnalytics  
## - r/GoogleTagManager  
## - r/GoogleAdwords  
## - r/GoogleDataStudio
# Subscriber count at render time.
print(analytics_subreddit.subscribers)
## 129808
# get hot posts from the r/analytics subreddit
# .hot() returns a lazy generator of submissions; print just the titles.
hot_posts = reddit.subreddit('analytics').hot(limit=10)
for post in hot_posts:
    print(post.title)
## Monthly Career Advice and Job Openings
## 2022 State of Marketing Survey Results
## I have an interview
## Need a new site analytics tool but no clue where to start?? Is there a site that compares them all?
## Combining Lifetime Value and Attribution Google Reports
## What are some of the best product analytics software for mobile apps in 2023?
## Changing source from UA to GA4 for the DW? Is it difficult?
## Can You Import a Customized Old GA Report into GA4?
## So what certifications actually matter in the analytics world?
## Getting into SQL as a Data Analyst
# Collect the ten current "hot" submissions into a tidy pandas DataFrame,
# one row per post, keeping the same eight fields as before.
_columns = ['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created']
hot_posts = pd.DataFrame(
    [
        [p.title, p.score, p.id, p.subreddit, p.url,
         p.num_comments, p.selftext, p.created]
        for p in analytics_subreddit.hot(limit=10)
    ],
    columns=_columns,
)
# Pull the pandas DataFrame built in the Python chunk into R via reticulate
hot_posts_df <- py$hot_posts
glimpse(hot_posts_df)
## Rows: 10
## Columns: 8
## $ title        <chr> "Monthly Career Advice and Job Openings", "2022 State of …
## $ score        <dbl> 12, 2, 10, 3, 2, 13, 6, 1, 35, 16
## $ id           <chr> "10g5ayb", "101vapj", "11264hg", "112htr8", "112hbxv", "1…
## $ subreddit    <list> analytics, analytics, analytics, analytics, analytics, an…
## $ url          <chr> "https://www.reddit.com/r/analytics/comments/10g5ayb/mon…
## $ num_comments <dbl> 47, 0, 22, 1, 1, 5, 1, 1, 44, 27
## $ body         <chr> "1. Have a question regarding interviewing, career advice…
## $ created      <dbl> 1674144014, 1672712257, 1676383423, 1676413065, 167641181…
# Peek at the full body text of the first hot post
hot_posts_df$body[1]
## [1] "1. Have a question regarding interviewing, career advice, certifications?  Please include country, years of experience, vertical market, and size of business if applicable.\n2. Share your current marketing openings in the comments below. Include description, location (city/state), requirements, if it's on-site or remote, and salary.\n\nCheck out the community sidebar for other resources and our Discord link"

Get Posts

See for ideas: https://melaniewalsh.github.io/Intro-Cultural-Analytics/04-Data-Collection/14-Reddit-Data.html

# Define the search window as Unix epoch seconds.
# NOTE(review): datetime(...).timestamp() interprets these as local time,
# not UTC — confirm that is the intended boundary.
start_epoch = int(dt.datetime(2022, 1, 1).timestamp())
end_epoch = int(dt.datetime(2023, 2, 1).timestamp())
# Query the Pushshift archive for r/analytics submissions in the window;
# the commented-out `q` shows how to add a keyword filter.
posts_praw = api_praw.search_submissions(
  #q = "learning", 
  subreddit = 'analytics', 
  after = start_epoch,
  before = end_epoch,
  limit = 1000
)

# Materialize the result generator and convert it to a DataFrame.
post_list = [post for post in posts_praw]
posts_df = pd.DataFrame(post_list)
posts_df.shape
# Inspect the first subreddit value (a praw object on the Python side, not a
# plain string — hence the subreddit is re-derived from permalink below)
py$posts_df$subreddit[[1]]
# Tidy the Python data frame: keep the key columns, rename for clarity, and
# derive subreddit and local date/time fields
posts_df_r <- 
  py$posts_df %>% 
  select(created_utc, subreddit, id, author, title, selftext, 
         num_comments, score, ups, downs, upvote_ratio, permalink, url) %>%
  rename(post_id = id,
         post_date_time = created_utc,
         post_text = selftext) %>%
  # NOTE(review): permalink looks like "/r/<name>/comments/<id>/...";
  # stripping the leading "/" and the "/comments..." tail leaves "r/<name>",
  # which is the label used consistently downstream
  mutate(subreddit = stringr::str_remove(permalink, "/"),
         subreddit = stringr::str_remove_all(subreddit, "/comments.*"),
         # epoch seconds -> POSIXct (UTC), then shift display to US/Eastern
         post_date_time = anytime::anytime(post_date_time, asUTC=TRUE),
         post_date_time = lubridate::ymd_hms(lubridate::as_datetime(post_date_time)),
         post_date_time = lubridate::with_tz(post_date_time, tzone='US/Eastern'),
         date = date(post_date_time),
         year = year(post_date_time)) #%>%
  #distinct(post_id, .keep_all = TRUE)

# Sanity checks (subreddit label, date range), then persist the raw post table
posts_df_r$subreddit[1]; min(posts_df_r$date); max(posts_df_r$date)
write_csv(posts_df_r, "./data/reddit-analytics-posts.csv")
# Re-load the saved posts and classify each by moderation status:
# "[deleted]" / "[removed]" bodies are flagged; everything else — including
# posts with no text, which read back from CSV as NA — counts as "remaining".
posts <-
  read_csv("./data/reddit-analytics-posts.csv") %>%
  mutate(
    status = case_when(
      post_text == "[deleted]" ~ "deleted",
      post_text == "[removed]" ~ "removed",
      TRUE ~ "remaining"  # catches both non-matching text and NA
    ),
    post_url = paste0("https://www.reddit.com", permalink)
  )

# Tally posts by status
posts %>% count(status)
## # A tibble: 3 × 2
##   status        n
##   <chr>     <int>
## 1 deleted      48
## 2 remaining   488
## 3 removed     263
# Keep only posts whose text survived moderation
filtered_posts <- posts %>% filter(status == "remaining")

# Row count plus a rough estimate of how many comments we should collect
nrow(filtered_posts)
paste("Expected number of comments:", sum(filtered_posts$num_comments))
write_csv(filtered_posts, "./data/reddit-analytics-posts-filtered.csv")

Get Comments

# Reload the filtered posts and list the distinct subreddit labels to iterate over
posts2 <- read_csv("./data/reddit-analytics-posts-filtered.csv")
subreddit_vector <- unique(posts2$subreddit)

#' Fetch all comments for the posts of one subreddit.
#'
#' @param x A subreddit label as stored in `posts2$subreddit`
#'   (e.g. "r/analytics").
#' @return A data frame of comments with `subreddit` and a character
#'   `comment_id` column appended, or whatever non-data-frame value
#'   `get_thread_content()` yields when no comments are found.
comments_with_subreddit <-
  function(x) {
    thread_urls <-
      posts2 %>%
      filter(subreddit == x) %>%
      pull(post_url)
    # get_thread_content() returns a list of two data frames, "threads" and
    # "comments" — extract by name rather than position for robustness.
    tmp_comments <-
      RedditExtractoR::get_thread_content(thread_urls)[["comments"]]
    if (is.data.frame(tmp_comments)) {
      # comment_id is coerced to character so bind_rows() later cannot
      # fail on mixed column types across batches.
      tmp_comments <-
        tmp_comments %>%
        mutate(subreddit = x,
               comment_id = as.character(comment_id))
    }
    tmp_comments
  }

# Pre-allocate the result list so the loop fills fixed slots instead of
# growing an empty list one element at a time.
comments_list <- vector("list", length(subreddit_vector))
for (i in seq_along(subreddit_vector)) {
  print(i)  # crude progress indicator; the network calls are slow
  comments_list[[i]] <- comments_with_subreddit(subreddit_vector[i])
}
beepr::beep(8)  # audible alert when the long-running scrape finishes

# Stack the per-subreddit comment tables; .id records which list element
# (i.e. which scrape batch) each row came from.
all_comments <- comments_list %>% bind_rows(.id = "column_label")

# Drop comments whose text was deleted or removed by moderation.
# (De Morgan form of the original pair of inequalities; NA comments are
# excluded by filter() in both formulations.)
filtered_comments <-
  all_comments %>%
  filter(!(comment == "[deleted]" | comment == "[removed]"))

# Before/after row counts, then persist both tables
nrow(all_comments)
nrow(filtered_comments)
write_csv(all_comments, "./data/reddit-analytics-comments.csv")
write_csv(filtered_comments, "./data/reddit-analytics-comments-filtered.csv")