mirror of
https://github.com/bellingcat/reddit-post-scraping-tool.git
synced 2026-06-13 05:58:29 +03:00
Resolve conflict
This commit is contained in:
55
RPST GUI/RPST/ApiHandler.vb
Normal file
55
RPST GUI/RPST/ApiHandler.vb
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
Imports System.IO
|
||||||
|
Imports System.Net.Http
|
||||||
|
Imports Newtonsoft.Json
|
||||||
|
Imports Newtonsoft.Json.Linq
|
||||||
|
|
||||||
|
''' <summary>
''' Handles requests to the Reddit and GitHub APIs.
''' </summary>
Public Class ApiHandler
    ''' <summary>
    ''' Single HttpClient reused for every request. Creating and disposing a client
    ''' per call leaves sockets lingering in TIME_WAIT, so one instance is shared.
    ''' </summary>
    Private Shared ReadOnly SharedClient As New HttpClient()

    ''' <summary>
    ''' Path of the debug log that request failures are appended to.
    ''' </summary>
    Public Property LogFile As String = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData), "RedditPostScrapingTool", "logs", "debug.log")

    ''' <summary>
    ''' Value sent as the User-Agent header of every request.
    ''' </summary>
    Public Property Headers As String = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15"

    ''' <summary>
    ''' GitHub API endpoint queried by <see cref="CheckUpdatesAsync"/>.
    ''' </summary>
    Public Property UpdatesEndpoint As String = "https://api.github.com/repos/bellingcat/reddit-post-scraping-tool/releases/latest"

    ''' <summary>
    ''' Asynchronously scrapes Reddit data.
    ''' </summary>
    ''' <param name="subreddit">Name of the subreddit, inserted into the URL path.</param>
    ''' <param name="listing">Listing name, inserted into the URL path before ".json".</param>
    ''' <param name="limit">Value of the "limit" query parameter.</param>
    ''' <param name="timeframe">Value of the "t" query parameter.</param>
    ''' <returns>Json object containing scraped data (empty if the request failed).</returns>
    Public Async Function ScrapeRedditAsync(subreddit As String, listing As String, limit As Integer, timeframe As String) As Task(Of JObject)
        Dim apiEndpoint As String = $"https://www.reddit.com/r/{subreddit}/{listing}.json?limit={limit}&t={timeframe}"
        Return Await GetJObjectFromEndpointAsync(endpoint:=apiEndpoint)
    End Function

    ''' <summary>
    ''' Asynchronously gets remote version information from the repository release page.
    ''' </summary>
    ''' <returns>Json object containing update data (empty if the request failed).</returns>
    Public Async Function CheckUpdatesAsync() As Task(Of JObject)
        Return Await GetJObjectFromEndpointAsync(endpoint:=UpdatesEndpoint)
    End Function

    ''' <summary>
    ''' Asynchronously retrieves a JObject from the specified endpoint.
    ''' A non-success status code or an exception is reported to the user in a
    ''' message box; exceptions are additionally appended to <see cref="LogFile"/>.
    ''' </summary>
    ''' <param name="endpoint">The URL endpoint to retrieve data from.</param>
    ''' <returns>
    ''' A JObject containing the retrieved data, or an empty JObject when the
    ''' request failed or the body did not contain a JSON object.
    ''' </returns>
    Private Async Function GetJObjectFromEndpointAsync(endpoint As String) As Task(Of JObject)
        Try
            Using request As New HttpRequestMessage(HttpMethod.Get, endpoint)
                ' Set per request (not on the shared client) so changes to Headers are honoured.
                request.Headers.Add("User-Agent", Headers)

                Using response As HttpResponseMessage = Await SharedClient.SendAsync(request)
                    If response.IsSuccessStatusCode Then
                        ' Await instead of .Result: never block the calling thread inside an Async method.
                        Dim json As String = Await response.Content.ReadAsStringAsync()
                        Dim data As JObject = JsonConvert.DeserializeObject(Of JObject)(json)

                        ' A body of "null" (or an empty body) deserializes to Nothing;
                        ' hand back an empty object so callers always get a usable JObject.
                        Return If(data, New JObject())
                    End If

                    MessageBox.Show(response.ReasonPhrase, "Error", MessageBoxButtons.OK, MessageBoxIcon.Error)
                End Using
            End Using
        Catch ex As Exception
            AppendToLog(ex)
            MessageBox.Show($"{ex.Message}. Please see the debug log '{LogFile}' for more information.", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error)
        End Try

        Return New JObject()
    End Function

    ''' <summary>
    ''' Appends an exception to <see cref="LogFile"/>, creating the log directory if needed.
    ''' Logging is best-effort: a failure to write the log must not escape the
    ''' error handler that called it.
    ''' </summary>
    ''' <param name="ex">The exception to record.</param>
    Private Sub AppendToLog(ex As Exception)
        Try
            Dim logDirectory As String = Path.GetDirectoryName(LogFile)
            If Not String.IsNullOrEmpty(logDirectory) Then
                Directory.CreateDirectory(logDirectory)
            End If

            My.Computer.FileSystem.WriteAllText(LogFile, $"{DateTime.Now}: {ex}{Environment.NewLine}", True)
        Catch logError As Exception
            System.Diagnostics.Debug.WriteLine($"Could not write to '{LogFile}': {logError.Message}")
        End Try
    End Sub
End Class
|
||||||
131
rpst/rpst.py
Normal file
131
rpst/rpst.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
import argparse
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from glyphoji import glyph
|
||||||
|
from rich import print
|
||||||
|
from rich.tree import Tree
|
||||||
|
|
||||||
|
from .utils import convert_timestamp_to_datetime, write_post_data
|
||||||
|
|
||||||
|
|
||||||
|
def create_post_branch(
    post: dict, keyword: str, tree: Tree, args: argparse.Namespace
) -> Tree:
    """
    Extract the relevant data from a Reddit post and add it to a tree as a branch,
    followed by the post's selftext. The extracted data (including the selftext)
    is also handed to ``write_post_data``.

    :param post: A dictionary containing the data of a Reddit post
        (the fields are read from ``post["data"]``).
    :param keyword: The keyword that was used to find the post; in this case it is
        passed to ``write_post_data`` as the filename.
    :param tree: Tree where the post branch will be added.
    :param args: A namespace object from argparse, forwarded to ``write_post_data``.
    :returns: The tree that was passed in, with the post branch added.
    :raises KeyError: If ``post["data"]`` lacks one of the expected fields.
    """
    # Imported locally so this function carries its own dependency.
    # Reddit text is untrusted as far as Rich is concerned: markdown links such as
    # "[text](url)" or tags such as "[/r/foo]" would otherwise be parsed as console
    # markup and either vanish from the output or raise a MarkupError.
    from rich.markup import escape

    data = post["data"]

    # Define the data to extract from the post.
    post_data = {
        f"{glyph.id_button} ID": data["id"],
        f"{glyph.people_hugging} Subreddit": data["subreddit_name_prefixed"],
        f"{glyph.face_with_peeking_eye} Visibility": data["subreddit_type"],
        f"{glyph.framed_picture} Thumbnail": data["thumbnail"],
        f"{glyph.white_question_mark} Gilded": data["gilded"],
        f"{glyph.up_arrow} Upvotes": data["ups"],
        f"{glyph.chart_increasing} Upvote ratio": data["upvote_ratio"],
        f"{glyph.down_arrow} Downvotes": data["downs"],
        f"{glyph.trophy} Awards": data["total_awards_received"],
        f"{glyph.trophy} Top award": data["top_awarded_type"],
        f"{glyph.no_one_under_eighteen} Is NSFW?": data["over_18"],
        f"{glyph.left_arrow_curving_right} Is crosspostable?": data["is_crosspostable"],
        f"{glyph.bar_chart} Score": data["score"],
        f"{glyph.card_file_box} Category": data["category"],
        f"{glyph.globe_with_meridians} Domain": data["domain"],
        f"{glyph.calendar} Posted on": convert_timestamp_to_datetime(data["created"]),
        f"{glyph.calendar} Approved at": data["approved_at_utc"],
        f"{glyph.bust_in_silhouette} Approved by": data["approved_by"],
    }

    # Add the post's branch to the main tree.
    post_branch = tree.add(f"{glyph.page_with_curl} {escape(data['title'])}")

    # Add each piece of extracted data as a branch of the post_branch.
    for post_key, post_value in post_data.items():
        post_branch.add(escape(f"{post_key}: {post_value}"), style="dim")

    # The selftext is added to post_data only after the loop above: it must end up
    # in the written json/csv file, but in the tree it is shown as its own italic
    # branch rather than as a "key: value" line.
    selftext = data["selftext"]
    post_data[f"{glyph.clipboard} Text"] = selftext
    write_post_data(
        filename=keyword, post_data=post_data, tree_branch=post_branch, args=args
    )
    post_branch.add(escape(selftext), style="italic")

    return tree
|
||||||
|
|
||||||
|
|
||||||
|
def get_posts(args: argparse.Namespace) -> None:
    """
    Scrape a given subreddit for posts that contain a specified keyword.
    The search is limited by the number of posts and timeframe specified.

    Every post whose title or selftext contains the keyword (compared
    case-insensitively) is added to a result tree through ``create_post_branch``;
    the tree, ending with a summary line, is then printed.

    :param args: Namespace object from argparse.
    :raises requests.HTTPError: If Reddit answers with an error status code.

    Expected Object Attributes
    --------------------------
    - keyword: The keyword to search for in the posts.
    - subreddit: The subreddit to scrape.
    - listing: The type of posts to scrape. This could be 'hot', 'new', etc.
    - timeframe: The timeframe from which to scrape posts. This could be 'day', 'week', etc.
    - limit: The maximum number of posts to scrape.
    - json: If specified, all found posts will be written to a json file.
    """
    keyword = args.keyword
    subreddit = args.subreddit
    listing = args.listing
    timeframe = args.timeframe
    limit = args.limit

    # Create main result tree.
    main_tree = Tree(
        f"[bold]{glyph.calendar} {datetime.now()}[/]", guide_style="bold bright_blue"
    )

    # The context manager closes the session's connections once the request is done.
    with requests.Session() as session:
        # Set the User-Agent to mimic a Safari browser on a Mac. update() keeps
        # the session's other default headers instead of discarding them.
        session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, "
                "like Gecko) Version/14.1.1 Safari/605.1.15"
            }
        )

        # Send a GET request to the specified subreddit and listing,
        # limiting the response by the specified limit and timeframe.
        # Requests never times out by default, so an explicit timeout is given.
        response = session.get(
            f"https://www.reddit.com/r/{subreddit}/{listing}"
            f".json?limit={limit}&t={timeframe}",
            timeout=30,
        )
        # Fail with a clear HTTP error instead of a KeyError on the missing "data" key.
        response.raise_for_status()
        listing_data = response.json()

    posts = listing_data["data"]["children"]

    # Case-fold both sides of the comparison; folding only the keyword would miss
    # every post that spells it with capital letters.
    needle = keyword.casefold()

    # Counter for the number of posts found that contain the keyword.
    found_posts = 0

    # Loop through each post in the response.
    for post_index, post in enumerate(posts, start=1):
        post_fields = post["data"]
        if (
            needle in post_fields["selftext"].casefold()
            or needle in post_fields["title"].casefold()
        ):
            # Create a branch for found post(s) and show post index and post author as the title.
            found_tree = main_tree.add(
                f"{glyph.bust_in_silhouette} #{post_index} by [bold]@{post_fields['author']}[/]"
            )
            found_posts += 1
            create_post_branch(post=post, keyword=keyword, tree=found_tree, args=args)

    # Log the number of posts in which the keyword was found.
    main_tree.add(
        f"{glyph.check_mark_button} Keyword ('{keyword}') was found in "
        f"{found_posts}/{len(posts)} {listing} posts from r/{subreddit}."
    )
    print(main_tree)
|
||||||
Reference in New Issue
Block a user