knitr::opts_chunk$set(echo = TRUE)
library(reticulate)
library(tidyverse)
library(RedditExtractoR)
library(anytime)
library(lubridate)
library(beepr)
reticulate::use_condaenv("~/r-reticulate")
import pandas as pd
import datetime as dt
Create a Reddit app and get client ID and secret: https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps
Configure PRAW in R: https://praw.readthedocs.io/en/stable/getting_started/configuration.html#configuration
Store your Reddit app secrets as R variables so that the Python chunks can read them through reticulate's r object, as sketched below.
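A minimal sketch of one way to do this, assuming you have added REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, and REDDIT_USER_AGENT to your ~/.Renviron; the variable names my_client_id, my_client_secret, and my_user_agent are what the Python chunk below expects:

# read secrets from environment variables set in ~/.Renviron (names here are assumptions)
my_client_id     <- Sys.getenv("REDDIT_CLIENT_ID")
my_client_secret <- Sys.getenv("REDDIT_CLIENT_SECRET")
my_user_agent    <- Sys.getenv("REDDIT_USER_AGENT")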
Now, get started with PRAW:
import praw
reddit = praw.Reddit(
    client_id = r.my_client_id,
    client_secret = r.my_client_secret,
    user_agent = r.my_user_agent
)
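As a quick sanity check from the R side (a sketch; PRAW reports read_only as True when no username or password is supplied):

# reticulate exposes the Python object to R
# TRUE indicates the client is in read-only mode, which is all we need for scraping
py$reddit$read_only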
Next, install PMAW to access the Pushshift API archive. From the command line, run: pip3 install pmaw
from pmaw import PushshiftAPI
api_praw = PushshiftAPI(praw = reddit)
See https://towardsdatascience.com/scraping-reddit-data-1c0af3040768 for a walkthrough of scraping Reddit data with PRAW.
analytics_subreddit = reddit.subreddit('analytics')
print(analytics_subreddit.description)
## Dedicated to web analytics, data and business analytics. We're here to discuss analysis of data, learning of skills and implementation of web analytics.
##
## Discussions or questions on Google Analytics, statistics, R, earning qualifications, SQL and anything data related are encouraged.
##
## You need at least **5 comment karma** to post here. As in from the whole of reddit. If you don't have that, your posts will need to be manually approved so PM the mods after you've posted your thread.
##
## **[Join the community on Discord](https://discord.com/invite/q6ePcxeQja)**
##
## - [Marketing Jobs](https://lookingformarketing.com/jobs?utm_source=r_analytics&utm_medium=old_sidebar)
## - [Marketing Books](https://lookingformarketing.com/books?utm_source=r_analytics&utm_medium=old_sidebar)
## - [Marketing Podcasts](https://lookingformarketing.com/podcasts?utm_source=r_analytics&utm_medium=old_sidebar)
## - [Marketing Courses](https://lookingformarketing.com/courses?utm_source=r_analytics&utm_medium=old_sidebar)
## - [Marketing Events](https://lookingformarketing.com/events?utm_source=r_analytics&utm_medium=old_sidebar)
## - [Marketing Definitions](https://lookingformarketing.com/definitions?utm_source=r_analytics&utm_medium=old_sidebar)
##
##
## **Reddit Marketing**
## - r/marketing
## - r/socialmedia
## - r/advertising
## - r/digital_marketing
## - r/analytics
## - r/DigitalMarketing
## - r/webmarketing
## - r/AskMarketing
## - r/content_marketing
## - r/SocialMediaMarketing
## - r/GoogleAnalytics
## - r/GoogleTagManager
## - r/GoogleAdwords
## - r/GoogleDataStudio
print(analytics_subreddit.subscribers)
## 129808
# get hot posts from the r/analytics subreddit
hot_posts = reddit.subreddit('analytics').hot(limit=10)
for post in hot_posts:
    print(post.title)
## Monthly Career Advice and Job Openings
## 2022 State of Marketing Survey Results
## I have an interview
## Need a new site analytics tool but no clue where to start?? Is there a site that compares them all?
## Combining Lifetime Value and Attribution Google Reports
## What are some of the best product analytics software for mobile apps in 2023?
## Changing source from UA to GA4 for the DW? Is it difficult?
## Can You Import a Customized Old GA Report into GA4?
## So what certifications actually matter in the analytics world?
## Getting into SQL as a Data Analyst
hot_posts = []
for post in analytics_subreddit.hot(limit=10):
    hot_posts.append([post.title, post.score, post.id, post.subreddit, post.url,
                      post.num_comments, post.selftext, post.created])
hot_posts = pd.DataFrame(hot_posts, columns=['title', 'score', 'id', 'subreddit', 'url',
                                             'num_comments', 'body', 'created'])
hot_posts_df <- py$hot_posts
glimpse(hot_posts_df)
## Rows: 10
## Columns: 8
## $ title <chr> "Monthly Career Advice and Job Openings", "2022 State of …
## $ score <dbl> 12, 2, 10, 3, 2, 13, 6, 1, 35, 16
## $ id <chr> "10g5ayb", "101vapj", "11264hg", "112htr8", "112hbxv", "1…
## $ subreddit <list> analytics, analytics, analytics, analytics, analytics, an…
## $ url <chr> "https://www.reddit.com/r/analytics/comments/10g5ayb/mon…
## $ num_comments <dbl> 47, 0, 22, 1, 1, 5, 1, 1, 44, 27
## $ body <chr> "1. Have a question regarding interviewing, career advice…
## $ created <dbl> 1674144014, 1672712257, 1676383423, 1676413065, 167641181…
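Note that subreddit arrived as a list column of Python Subreddit objects rather than as character. A hedged sketch for flattening it, assuming reticulate::py_str() recovers each object's display name:

# flatten the list column into an ordinary character vector
hot_posts_df <-
  hot_posts_df %>%
  mutate(subreddit = purrr::map_chr(subreddit, reticulate::py_str))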
hot_posts_df$body[1]
## [1] "1. Have a question regarding interviewing, career advice, certifications? Please include country, years of experience, vertical market, and size of business if applicable.\n2. Share your current marketing openings in the comments below. Include description, location (city/state), requirements, if it's on-site or remote, and salary.\n\nCheck out the community sidebar for other resources and our Discord link"
For more ideas, see https://melaniewalsh.github.io/Intro-Cultural-Analytics/04-Data-Collection/14-Reddit-Data.html
start_epoch = int(dt.datetime(2022, 1, 1).timestamp())
end_epoch = int(dt.datetime(2023, 2, 1).timestamp())
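Because these bounds drive the query, it can be worth round-tripping them from R to confirm they decode to the intended window (a sketch; note that dt.datetime() above built the epochs in your local timezone):

# decode the epoch bounds back into date-times
anytime::anytime(py$start_epoch, asUTC = TRUE)
anytime::anytime(py$end_epoch, asUTC = TRUE)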
posts_praw = api_praw.search_submissions(
    # q = "learning",
    subreddit = 'analytics',
    after = start_epoch,
    before = end_epoch,
    limit = 1000
)
post_list = [post for post in posts_praw]
posts_df = pd.DataFrame(post_list)
posts_df.shape
py$posts_df$subreddit[[1]]
posts_df_r <-
  py$posts_df %>%
  select(created_utc, subreddit, id, author, title, selftext,
         num_comments, score, ups, downs, upvote_ratio, permalink, url) %>%
  rename(post_id = id,
         post_date_time = created_utc,
         post_text = selftext) %>%
  # derive a clean "r/analytics" label from the permalink
  mutate(subreddit = stringr::str_remove(permalink, "/"),
         subreddit = stringr::str_remove_all(subreddit, "/comments.*"),
         # created_utc is a Unix epoch; decode as UTC, then shift to Eastern time
         post_date_time = anytime::anytime(post_date_time, asUTC = TRUE),
         post_date_time = lubridate::with_tz(post_date_time, tzone = 'US/Eastern'),
         date = date(post_date_time),
         year = year(post_date_time)) #%>%
  #distinct(post_id, .keep_all = TRUE)
posts_df_r$subreddit[1]; min(posts_df_r$date); max(posts_df_r$date)
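Pushshift pulls can return duplicate submissions, which is what the commented-out distinct() step above anticipates. A quick check before writing out:

# post_ids that appear more than once; zero rows means no dedupe is needed
posts_df_r %>%
  count(post_id, sort = TRUE) %>%
  filter(n > 1)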
write_csv(posts_df_r, "./data/reddit-analytics-posts.csv")
posts <-
read_csv("./data/reddit-analytics-posts.csv") %>%
mutate(status = ifelse(post_text == "[deleted]",
"deleted",
ifelse(post_text == "[removed]",
"removed",
"remaining")),
status = ifelse(is.na(status),
"remaining",
status),
post_url = paste0("https://www.reddit.com", permalink)
)
posts %>% count(status)
## # A tibble: 3 × 2
## status n
## <chr> <int>
## 1 deleted 48
## 2 remaining 488
## 3 removed 263
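The nested ifelse() above works, but the same labeling can be written in one pass with dplyr::case_when(), which treats the NA comparisons as non-matches and so also covers link posts with empty post_text (an equivalent sketch):

posts <-
  read_csv("./data/reddit-analytics-posts.csv") %>%
  mutate(status = case_when(post_text == "[deleted]" ~ "deleted",
                            post_text == "[removed]" ~ "removed",
                            TRUE ~ "remaining"),
         post_url = paste0("https://www.reddit.com", permalink))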
filtered_posts <-
posts %>%
filter(status == "remaining")
nrow(filtered_posts); paste("Expected number of comments:", sum(filtered_posts$num_comments))
## [1] 488
## [1] "Expected number of comments: 3816"
write_csv(filtered_posts, "./data/reddit-analytics-posts-filtered.csv")
posts2 <- read_csv("./data/reddit-analytics-posts-filtered.csv")
subreddit_vector <- unique(posts2$subreddit)
comments_with_subreddit <-
  function(x) {
    tmp_comments <-
      posts2 %>%
      filter(subreddit == x) %>%
      pull(post_url) %>%
      RedditExtractoR::get_thread_content() %>%
      `[[`(2) # get_thread_content() returns a list; element 2 holds the comments
    # guard: only tag and type-fix when we actually got a comments data frame
    if (is.data.frame(tmp_comments)) {
      tmp_comments <-
        tmp_comments %>%
        mutate(subreddit = x,
               comment_id = as.character(comment_id))
    }
    return(tmp_comments)
  }
comments_list <- list()
for(i in seq_along(subreddit_vector)) {
print(i)
comments_list[[i]] <- comments_with_subreddit(subreddit_vector[i])
}
beepr::beep(8)
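Fetching hundreds of threads in one pass can trip rate limits or fail on a single bad URL, losing everything gathered so far. A defensive variant of the loop (a sketch; the pause length is an assumption, not a RedditExtractoR requirement):

# wrap each fetch so one failure yields NULL instead of aborting the loop
for (i in seq_along(subreddit_vector)) {
  print(i)
  comments_list[[i]] <- tryCatch(
    comments_with_subreddit(subreddit_vector[i]),
    error = function(e) {
      message("Failed on ", subreddit_vector[i], ": ", conditionMessage(e))
      NULL
    }
  )
  Sys.sleep(2) # assumed courtesy pause between batches
}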
all_comments <-
bind_rows(comments_list, .id = "column_label")
filtered_comments <-
  all_comments %>%
  filter(comment != "[deleted]",
         comment != "[removed]")
nrow(all_comments); nrow(filtered_comments)
write_csv(all_comments, "./data/reddit-analytics-comments.csv")
write_csv(filtered_comments, "./data/reddit-analytics-comments-filtered.csv")
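As a last check, the retrieved comment count can be compared against the expected total computed earlier (counts rarely match exactly, since comments are added and deleted between the post pull and the comment pull):

# actual comments retrieved vs. the num_comments total from the post pull
nrow(all_comments); sum(posts2$num_comments)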