mirror of
https://github.com/bellingcat/reddit-post-scraping-tool.git
synced 2026-06-08 03:28:30 +03:00
29
README.md
29
README.md
@@ -1,20 +1,18 @@
|
||||
# RPST (Reddit Post Scraping Tool)
|
||||
Given a subreddit name and a keyword, RPST will return all posts from a specified listing (default is 'top') that contain the provided keyword.
|
||||
|
||||
[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
|
||||

|
||||

|
||||
***
|
||||
[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
|
||||
|
||||
# ✅ Features
|
||||
## GUI
|
||||
- [x] Dark mode (Right-click)
|
||||
- [x] Saves results to a JSON (Right-click)
|
||||
- [x] Dark mode (*Right-click*)
|
||||
- [x] Saves results to a JSON file (*Right-click*)
|
||||
- [x] Logs errors to a file
|
||||
|
||||
## CLI
|
||||
- [x] Saves results to a JSON (-j/--json)
|
||||
- [x] Automatically checks for new updates. Notifies user if update were found.
|
||||
- [x] Saves results to JSON (*specify* `--json`)
|
||||
- [x] Saves results to CSV (*specify* `--csv`)
|
||||
- [x] Automatically checks for new updates, and notifies user if updates were found.
|
||||
|
||||
# 📃 TODO
|
||||
## GUI
|
||||
@@ -22,8 +20,21 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
|
||||
- [x] Add manual dark mode option, that will be persistent in all sessions
|
||||
- [ ] Make it save results to a CSV file
|
||||
|
||||
# Images & Screenshots
|
||||
## GUI
|
||||
* 
|
||||
* 
|
||||
|
||||
## CLI
|
||||
* 
|
||||
* 
|
||||
* 
|
||||
|
||||
|
||||
|
||||
|
||||
# 📖 Wiki
|
||||
[Refer to the Wiki](https://github.com/rly0nheart/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
|
||||
[Refer to the Wiki](https://github.com/bellingcat/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
|
||||
|
||||
# 😁 Donations
|
||||
If you like `RPST` and would like to show support, you can Buy A Coffee for the developer using the button below
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
# RPST (Reddit Post Scraping Tool)
|
||||
Given a subreddit name and a keyword, RPST will return all posts from a specified listing (default is 'top') that contain the provided keyword.
|
||||
|
||||
[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
|
||||

|
||||

|
||||
***
|
||||
[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
|
||||
|
||||
# ✅ Features
|
||||
## GUI
|
||||
- [x] Dark mode (Right-click)
|
||||
- [x] Saves results to a JSON (Right-click)
|
||||
- [x] Dark mode (*Right-click*)
|
||||
- [x] Saves results to a JSON file (*Right-click*)
|
||||
- [x] Logs errors to a file
|
||||
|
||||
## CLI
|
||||
- [x] Saves results to a JSON (-j/--json)
|
||||
- [x] Automatically checks for new updates. Notifies user if update were found.
|
||||
- [x] Saves results to JSON (*specify* `--json`)
|
||||
- [x] Saves results to CSV (*specify* `--csv`)
|
||||
- [x] Automatically checks for new updates, and notifies user if updates were found.
|
||||
|
||||
# 📃 TODO
|
||||
## GUI
|
||||
@@ -22,8 +20,21 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
|
||||
- [x] Add manual dark mode option, that will be persistent in all sessions
|
||||
- [ ] Make it save results to a CSV file
|
||||
|
||||
# Images & Screenshots
|
||||
## GUI
|
||||
* 
|
||||
* 
|
||||
|
||||
## CLI
|
||||
* 
|
||||
* 
|
||||
* 
|
||||
|
||||
|
||||
|
||||
|
||||
# 📖 Wiki
|
||||
[Refer to the Wiki](https://github.com/rly0nheart/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
|
||||
[Refer to the Wiki](https://github.com/bellingcat/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
|
||||
|
||||
# 😁 Donations
|
||||
If you like `RPST` and would like to show support, you can Buy A Coffee for the developer using the button below
|
||||
|
||||
@@ -13,11 +13,11 @@
|
||||
<PackageProjectUrl>https://github.com/bellingcat/reddit-post-scraping-tool</PackageProjectUrl>
|
||||
<PackageReadmeFile>README.md</PackageReadmeFile>
|
||||
<RepositoryUrl>https://github.com/bellingcat/reddit-post-scraping-tool</RepositoryUrl>
|
||||
<AssemblyVersion>1.6.2.0</AssemblyVersion>
|
||||
<FileVersion>1.6.2.0</FileVersion>
|
||||
<AssemblyVersion>1.7.0.0</AssemblyVersion>
|
||||
<FileVersion>1.7.0.0</FileVersion>
|
||||
<PackageLicenseFile>LICENSE</PackageLicenseFile>
|
||||
<PackageRequireLicenseAcceptance>True</PackageRequireLicenseAcceptance>
|
||||
<Version>1.6.2</Version>
|
||||
<Version>1.7.0</Version>
|
||||
<PackageTags>reddit;scraper;reddit-scraper;osint</PackageTags>
|
||||
<PackageReleaseNotes></PackageReleaseNotes>
|
||||
<AnalysisLevel>6.0-recommended</AnalysisLevel>
|
||||
@@ -39,7 +39,7 @@
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Newtonsoft.Json" Version="13.0.2" />
|
||||
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
@@ -78,4 +78,4 @@
|
||||
</None>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
@@ -7,18 +7,18 @@ packages = ["rpst"]
|
||||
|
||||
[project]
|
||||
name = "reddit-post-scraping-tool"
|
||||
version = "1.6.2.0"
|
||||
version = "1.7.0.0"
|
||||
description = "Given a subreddit name and a keyword, RPST returns all top (by default) posts that contain the specified keyword."
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.8"
|
||||
license = {file = "LICENSE"}
|
||||
keywords = ["osint", "reddit-crawler", "reddit-scraping", "reddit"]
|
||||
keywords = ["reddit-crawler", "reddit-scraping", "reddit", "reddit-api"]
|
||||
authors = [{name = "Richard Mwewa", email = "rly0nheart@duck.com"}]
|
||||
classifiers = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Visual Basic",
|
||||
"Intended Audience :: Information Technology",
|
||||
"Intended Audience :: End Users/Desktop",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
"Natural Language :: English"
|
||||
@@ -26,6 +26,7 @@ classifiers = [
|
||||
|
||||
dependencies = [
|
||||
"rich",
|
||||
"glyphoji",
|
||||
"requests",
|
||||
]
|
||||
|
||||
@@ -35,4 +36,4 @@ documentation = "https://github.com/bellingcat/reddit-post-scraping-tool/wiki"
|
||||
repository = "https://github.com/bellingcat/reddit-post-scraping-tool.git"
|
||||
|
||||
[project.scripts]
|
||||
rpst = "rpst.__main:run"
|
||||
rpst = "rpst.main:run"
|
||||
|
||||
263
rpst/__rpst.py
263
rpst/__rpst.py
@@ -1,263 +0,0 @@
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from rich.tree import Tree
|
||||
from rich import print as xprint
|
||||
from rich.markdown import Markdown
|
||||
from rich.logging import RichHandler
|
||||
|
||||
|
||||
def convert_timestamp_to_datetime(timestamp: int) -> str:
|
||||
"""
|
||||
Converts a Unix timestamp to a formatted datetime string.
|
||||
|
||||
:param timestamp: The Unix timestamp to be converted.
|
||||
:return: A formatted datetime string in the format "dd MMMM yyyy, hh:mm:ssAM/PM".
|
||||
"""
|
||||
utc_from_timestamp = datetime.utcfromtimestamp(timestamp)
|
||||
datetime_object = utc_from_timestamp.strftime("%d %B %Y, %I:%M:%S%p")
|
||||
return datetime_object
|
||||
|
||||
|
||||
def write_post_data(post_data: dict, filename: str) -> str:
|
||||
"""
|
||||
Writes post data to a specified JSON file.
|
||||
|
||||
:param post_data: A dictionary containing post data.
|
||||
:param filename: The name of the file to which post data will be written.
|
||||
:returns: A string representation of the file path.
|
||||
"""
|
||||
home_directory = os.path.expanduser("~")
|
||||
file_path = os.path.join(home_directory, f"{filename}.json")
|
||||
|
||||
# Write the data to a JSON file
|
||||
with open(file_path, "a") as file:
|
||||
file.write(json.dumps(post_data))
|
||||
file.write("\n") # write a newline to separate posts.
|
||||
|
||||
return file.name
|
||||
|
||||
|
||||
def check_updates(version_tag: str):
|
||||
"""
|
||||
This function checks if there's a new release of a project on GitHub. If there is, it logs an
|
||||
information message and prints the release notes.
|
||||
|
||||
:param version_tag: A string representing the current version of the project.
|
||||
"""
|
||||
|
||||
# Make a GET request to the GitHub API to get the latest release of the project.
|
||||
response = requests.get(
|
||||
"https://api.github.com/repos/bellingcat/reddit-post-scraping-tool/releases/latest"
|
||||
).json()
|
||||
|
||||
# Check if the latest release's tag matches the current version tag.
|
||||
if response["tag_name"] != version_tag:
|
||||
# If not, convert the release notes from Markdown to HTML.
|
||||
raw_release_notes = response["body"]
|
||||
markdown_release_notes = Markdown(raw_release_notes)
|
||||
|
||||
# Log an info message about the new release.
|
||||
log.info(
|
||||
f"A new release of RPST is available ({response['tag_name']}). "
|
||||
f"Run 'pip install --upgrade reddit-post-scraping-tool' to get the updates."
|
||||
)
|
||||
|
||||
# Print the release notes.
|
||||
xprint(markdown_release_notes)
|
||||
|
||||
|
||||
def create_post_branch(post: dict, keyword: str, output: bool, tree: Tree) -> Tree:
|
||||
"""
|
||||
This function extracts relevant data from a Reddit post and adds it in a tree branch structure,
|
||||
followed by the post's selftext.
|
||||
|
||||
:param post: A dictionary containing the data of a Reddit post.
|
||||
:param keyword: The keyword that is used to find posts, in his case gets uses as the filename.
|
||||
:param output: If specified, all found posts will be written to a json file.
|
||||
:param tree: Tree where the post branch will be added.
|
||||
:returns: The main tree with added post branches.
|
||||
"""
|
||||
# Define the data to extract from the post.
|
||||
post_data = {
|
||||
# "Author": post["data"]["author"],
|
||||
"ID": post["data"]["id"],
|
||||
"Subreddit": post["data"]["subreddit_name_prefixed"],
|
||||
"Visibility": post["data"]["subreddit_type"],
|
||||
"Thumbnail": post["data"]["thumbnail"],
|
||||
"Gilded": post["data"]["gilded"],
|
||||
"Upvotes": post["data"]["ups"],
|
||||
"Upvote ratio": post["data"]["upvote_ratio"],
|
||||
"Downvotes": post["data"]["downs"],
|
||||
"Awards": post["data"]["total_awards_received"],
|
||||
"Top award": post["data"]["top_awarded_type"],
|
||||
"Is NSFW?": post["data"]["over_18"],
|
||||
"Is crosspostable?": post["data"]["is_crosspostable"],
|
||||
"Score": post["data"]["score"],
|
||||
"Category": post["data"]["category"],
|
||||
"Domain": post["data"]["domain"],
|
||||
"Posted on": convert_timestamp_to_datetime(post["data"]["created"]),
|
||||
"Approved at": post["data"]["approved_at_utc"],
|
||||
"Approved by": post["data"]["approved_by"],
|
||||
}
|
||||
|
||||
# Add the post's branch to the main tree.
|
||||
post_branch = tree.add(f":scroll: {post['data']['title']}")
|
||||
|
||||
# Add each piece of extracted data as a branch of the post_branch.
|
||||
for post_key, post_value in post_data.items():
|
||||
post_branch.add(f"{post_key}: {post_value}", style="dim")
|
||||
|
||||
# If -j/--json is passed, write found posts to a json file.
|
||||
if output:
|
||||
# This ensures that the post's selftext is also added to the written json file.
|
||||
post_data["Text"] = post["data"]["selftext"]
|
||||
output_file = write_post_data(filename=keyword, post_data=post_data)
|
||||
tree.add(
|
||||
f":page_facing_up: Post data written/appended to "
|
||||
f"[italic][link file://{output_file}]{output_file}[/]"
|
||||
)
|
||||
post_branch.add(post["data"]["selftext"], style="italic")
|
||||
|
||||
return tree
|
||||
|
||||
|
||||
def get_posts(arguments: argparse):
|
||||
"""
|
||||
Scrapes a given subreddit for posts that contain a specified keyword.
|
||||
The search is limited by the number of posts and timeframe specified.
|
||||
|
||||
:param arguments: Namespace object from argparse.
|
||||
|
||||
Expected Object Attributes
|
||||
--------------------------
|
||||
- keyword: The keyword to search for in the posts.
|
||||
- subreddit: The subreddit to scrape.
|
||||
- listing: The type of posts to scrape. This could be 'hot', 'new', etc.
|
||||
- timeframe: The timeframe from which to scrape posts. This could be 'day', 'week', etc.
|
||||
- limit: The maximum number of posts to scrape.
|
||||
- json: If specified, all found posts will be written to a json file.
|
||||
"""
|
||||
keyword = arguments.keyword
|
||||
subreddit = arguments.subreddit
|
||||
listing = arguments.listing
|
||||
timeframe = arguments.timeframe
|
||||
limit = arguments.limit
|
||||
json_output = arguments.json
|
||||
|
||||
# Create main result tree.
|
||||
main_tree = Tree(f"[bold]{datetime.now()}[/]", guide_style="bold bright_blue")
|
||||
|
||||
# Start a new session
|
||||
session = requests.session()
|
||||
# Set the User-Agent to mimic a Safari browser on a Mac.
|
||||
session.headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, "
|
||||
"like Gecko) Version/14.1.1 Safari/605.1.15"
|
||||
}
|
||||
|
||||
# Send a GET request to the specified subreddit and listing,
|
||||
# limiting the response by the specified limit and timeframe.
|
||||
response = session.get(
|
||||
f"https://reddit.com/r/{subreddit}/{listing}"
|
||||
f".json?limit={limit}&t={timeframe}"
|
||||
).json()
|
||||
|
||||
# Initialize a counter for the number of posts found that contain the keyword.
|
||||
found_posts = 0
|
||||
|
||||
# Loop through each post in the response
|
||||
for post_index, post in enumerate(response["data"]["children"], start=1):
|
||||
# If the keyword is found in the post's selftext or title, increment the counter and process the post.
|
||||
if (
|
||||
keyword.lower() in post["data"]["selftext"]
|
||||
or keyword.lower() in post["data"]["title"]
|
||||
):
|
||||
# Create a branch for found post(s) and show post index and post author as the title
|
||||
found_tree = main_tree.add(
|
||||
f":bust_in_silhouette: #{post_index} by [bold]@{post['data']['author']}[/]"
|
||||
)
|
||||
found_posts += 1
|
||||
create_post_branch(
|
||||
post=post,
|
||||
keyword=keyword,
|
||||
output=json_output,
|
||||
tree=found_tree,
|
||||
)
|
||||
|
||||
# Log the number of posts in which the keyword was found
|
||||
main_tree.add(
|
||||
f"Keyword ('{keyword}') was found in {found_posts}/{len(response['data']['children'])} "
|
||||
f"{listing} posts from r/{subreddit}."
|
||||
)
|
||||
xprint(main_tree)
|
||||
|
||||
|
||||
def create_parser():
|
||||
"""
|
||||
Creates and configures an argument parser for the command line arguments.
|
||||
|
||||
:return: A configured argparse.ArgumentParser object ready to parse the command line arguments.
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="RPST (Reddit Post Scraping Tool) —by Richard Mwewa | https://about.me/rly0nheart",
|
||||
epilog="Given a subreddit name and a keyword, "
|
||||
"RPST returns all top (by default) posts that contain the specified keyword.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-k", "--keyword", help="The keyword to search for in the posts.", required=True
|
||||
)
|
||||
parser.add_argument(
|
||||
"-s", "--subreddit", help="The subreddit to scrape.", required=True
|
||||
)
|
||||
parser.add_argument(
|
||||
"-c",
|
||||
"--limit",
|
||||
help="The maximum number of posts to scrape (1-100). (default: %(default)s)",
|
||||
default=10,
|
||||
type=int,
|
||||
choices=range(
|
||||
1, 101
|
||||
), # This enforces that the limit must be between 1 and 100 inclusive.
|
||||
)
|
||||
parser.add_argument(
|
||||
"-l",
|
||||
"--listing",
|
||||
default="top",
|
||||
const="top",
|
||||
nargs="?",
|
||||
choices=["controversial", "hot", "best", "new", "rising"],
|
||||
help="The type of posts to scrape (default: %(default)s)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--timeframe",
|
||||
default="all",
|
||||
const="all",
|
||||
nargs="?",
|
||||
choices=["hour", "day", "week", "month", "year", "all"],
|
||||
help="The timeframe from which to scrape posts (default: %(default)s)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-j",
|
||||
"--json",
|
||||
help="Write all found posts to a json file.",
|
||||
action="store_true",
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
level="NOTSET",
|
||||
format="%(message)s",
|
||||
handlers=[
|
||||
RichHandler(markup=True, log_time_format="[%H:%M:%S%p]", show_level=False)
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("rich")
|
||||
@@ -1,5 +1,7 @@
|
||||
from datetime import datetime
|
||||
from rpst.__rpst import log, get_posts, check_updates, create_parser
|
||||
|
||||
from .rpst import get_posts
|
||||
from .utils import create_parser, set_loglevel, check_updates
|
||||
|
||||
|
||||
def run():
|
||||
@@ -10,20 +12,22 @@ def run():
|
||||
|
||||
# Create a parser and parse the command line arguments
|
||||
parser = create_parser()
|
||||
arguments = parser.parse_args()
|
||||
args = parser.parse_args()
|
||||
|
||||
log = set_loglevel(args=args)
|
||||
|
||||
# Record the start time
|
||||
start_time = datetime.now()
|
||||
|
||||
try:
|
||||
# Check for updates
|
||||
check_updates(version_tag="1.6.2.0")
|
||||
check_updates(version_tag="1.7.0.0")
|
||||
|
||||
# Get posts with the provided/parsed arguments
|
||||
get_posts(arguments=arguments)
|
||||
get_posts(args=args)
|
||||
except KeyboardInterrupt:
|
||||
log.warning("User interruption detected.")
|
||||
except Exception as e:
|
||||
log.error(f"An error occurred: {e}")
|
||||
finally:
|
||||
log.info(f'Finished in {datetime.now() - start_time} seconds.')
|
||||
log.info(f"Finished in {datetime.now() - start_time} seconds.")
|
||||
131
rpst/rpst.py
Normal file
131
rpst/rpst.py
Normal file
@@ -0,0 +1,131 @@
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from glyphoji import glyph
|
||||
from rich.tree import Tree
|
||||
from rich import print as xprint
|
||||
|
||||
from .utils import convert_timestamp_to_datetime, write_post_data
|
||||
|
||||
|
||||
def create_post_branch(post: dict, keyword: str, tree: Tree, args: argparse) -> Tree:
    """
    Render a single Reddit post as a branch of ``tree`` and pass its data to the writer.

    The post's title becomes a new branch; every extracted field is added beneath it,
    followed by the writer's status lines and finally the post's selftext.

    :param post: A dictionary containing the data of a Reddit post (read via ``post["data"]``).
    :param keyword: The keyword used to find posts; in this case it is also used as the
        name of the output file.
    :param tree: Tree where the post branch will be added.
    :param args: A namespace object from argparse, forwarded to ``write_post_data``.
    :returns: The same ``tree`` that was passed in, with the post branch added.
    """
    details = post["data"]

    # (icon, label, value) rows; the order here is the order shown in the tree
    # and the column/key order of any file that gets written.
    rows = (
        (glyph.id_button, "ID", details["id"]),
        (glyph.people_hugging, "Subreddit", details["subreddit_name_prefixed"]),
        (glyph.face_with_peeking_eye, "Visibility", details["subreddit_type"]),
        (glyph.framed_picture, "Thumbnail", details["thumbnail"]),
        (glyph.white_question_mark, "Gilded", details["gilded"]),
        (glyph.up_arrow, "Upvotes", details["ups"]),
        (glyph.chart_increasing, "Upvote ratio", details["upvote_ratio"]),
        (glyph.down_arrow, "Downvotes", details["downs"]),
        (glyph.trophy, "Awards", details["total_awards_received"]),
        (glyph.trophy, "Top award", details["top_awarded_type"]),
        (glyph.no_one_under_eighteen, "Is NSFW?", details["over_18"]),
        (
            glyph.left_arrow_curving_right,
            "Is crosspostable?",
            details["is_crosspostable"],
        ),
        (glyph.bar_chart, "Score", details["score"]),
        (glyph.card_file_box, "Category", details["category"]),
        (glyph.globe_with_meridians, "Domain", details["domain"]),
        (
            glyph.calendar,
            "Posted on",
            convert_timestamp_to_datetime(details["created"]),
        ),
        (glyph.calendar, "Approved at", details["approved_at_utc"]),
        (glyph.bust_in_silhouette, "Approved by", details["approved_by"]),
    )
    post_data = {f"{icon} {label}": value for icon, label, value in rows}

    # The post gets its own branch, titled with the post's title.
    post_branch = tree.add(f"{glyph.scroll} {details['title']}")
    for field_name, field_value in post_data.items():
        post_branch.add(f"{field_name}: {field_value}", style="dim")

    # The selftext is added to the record only after the fields above were rendered,
    # so it ends up in the written json/csv file without being listed as a dim field.
    body_text = details["selftext"]
    post_data[f"{glyph.clipboard} Text"] = body_text
    write_post_data(
        post_data=post_data, filename=keyword, args=args, tree_branch=post_branch
    )
    post_branch.add(body_text, style="italic")

    return tree
|
||||
|
||||
|
||||
def get_posts(args: argparse):
    """
    Scrapes a given subreddit for posts that contain a specified keyword.
    The search is limited by the number of posts and timeframe specified.

    The keyword match is case-insensitive and is applied to both the title and the
    selftext of every post in the fetched listing. The resulting tree is printed.

    :param args: Namespace object from argparse.
    :raises requests.RequestException: If the request fails, times out, or Reddit
        answers with an HTTP error status.

    Expected Object Attributes
    --------------------------
    - keyword: The keyword to search for in the posts.
    - subreddit: The subreddit to scrape.
    - listing: The type of posts to scrape. This could be 'hot', 'new', etc.
    - timeframe: The timeframe from which to scrape posts. This could be 'day', 'week', etc.
    - limit: The maximum number of posts to scrape.

    The whole namespace is also forwarded to ``create_post_branch``.
    """
    keyword = args.keyword
    subreddit = args.subreddit
    listing = args.listing
    timeframe = args.timeframe
    limit = args.limit

    # Create main result tree.
    main_tree = Tree(
        f"[bold]{glyph.calendar} {datetime.now()}[/]", guide_style="bold bright_blue"
    )

    # The session is used as a context manager so its connections are released
    # even when the request raises.
    with requests.session() as session:
        # Set the User-Agent to mimic a Safari browser on a Mac.
        session.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, "
            "like Gecko) Version/14.1.1 Safari/605.1.15"
        }

        # Send a GET request to the specified subreddit and listing,
        # limiting the response by the specified limit and timeframe.
        # A timeout keeps the tool from hanging forever on a stalled connection.
        response = session.get(
            f"https://reddit.com/r/{subreddit}/{listing}.json",
            params={"limit": limit, "t": timeframe},
            timeout=30,
        )

    # Fail with a clear HTTP error instead of an opaque KeyError further down.
    response.raise_for_status()
    posts = response.json()["data"]["children"]

    # Lower-case the keyword once; each post's text is lower-cased as well so the
    # comparison is case-insensitive on both sides.
    needle = keyword.lower()

    # Counter for the number of posts found that contain the keyword.
    found_posts = 0

    for post_index, post in enumerate(posts, start=1):
        data = post["data"]
        selftext = (data.get("selftext") or "").lower()
        title = (data.get("title") or "").lower()

        if needle in selftext or needle in title:
            # Create a branch for found post(s) and show post index and post author as the title
            found_tree = main_tree.add(
                f"{glyph.bust_in_silhouette} #{post_index} by [bold]@{data['author']}[/]"
            )
            found_posts += 1
            create_post_branch(post=post, keyword=keyword, tree=found_tree, args=args)

    # Summarise the number of posts in which the keyword was found
    main_tree.add(
        f"{glyph.check_mark_button} Keyword ('{keyword}') was found in "
        f"{found_posts}/{len(posts)} {listing} posts from r/{subreddit}."
    )
    xprint(main_tree)
|
||||
202
rpst/utils.py
Normal file
202
rpst/utils.py
Normal file
@@ -0,0 +1,202 @@
|
||||
import os
|
||||
import csv
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from glyphoji import glyph
|
||||
from rich.tree import Tree
|
||||
from rich import print as xprint
|
||||
from rich.markdown import Markdown
|
||||
from rich.logging import RichHandler
|
||||
|
||||
|
||||
def convert_timestamp_to_datetime(timestamp: int) -> str:
    """
    Converts a Unix timestamp to a formatted datetime string (UTC).

    :param timestamp: The Unix timestamp to be converted.
    :return: A formatted datetime string in the format "dd MMMM yyyy, hh:mm:ssAM/PM".
    """
    # Imported here because the module-level import only brings in `datetime`.
    from datetime import timezone

    # `datetime.utcfromtimestamp` is deprecated since Python 3.12; an aware UTC
    # datetime formats to exactly the same string.
    utc_moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    return utc_moment.strftime("%d %B %Y, %I:%M:%S%p")
|
||||
|
||||
|
||||
def create_parser():
    """
    Creates and configures an argument parser for the command line arguments.

    Options: -k/--keyword and -s/--subreddit (both required), -c/--limit (1-100),
    -l/--listing, -t/--timeframe, --json, --csv and -d/--debug.

    :return: A configured argparse.ArgumentParser object ready to parse the command line arguments.
    """
    parser = argparse.ArgumentParser(
        description="RPST (Reddit Post Scraping Tool) —by Richard Mwewa | https://about.me/rly0nheart",
        epilog="Given a subreddit name and a keyword, "
        "RPST returns all top (by default) posts that contain the specified keyword.",
    )

    parser.add_argument(
        "-k", "--keyword", help="The keyword to search for in the posts.", required=True
    )
    parser.add_argument(
        "-s", "--subreddit", help="The subreddit to scrape.", required=True
    )
    parser.add_argument(
        "-c",
        "--limit",
        help="The maximum number of posts to scrape (1-100). (default: %(default)s)",
        default=10,
        type=int,
        choices=range(
            1, 101
        ),  # This enforces that the limit must be between 1 and 100 inclusive.
    )
    parser.add_argument(
        "-l",
        "--listing",
        default="top",
        const="top",
        nargs="?",
        # "top" must be listed: argparse validates an explicit value *and* the
        # `const` used for a bare `-l` against `choices`, so leaving it out made
        # the default listing impossible to request explicitly.
        choices=["top", "controversial", "hot", "best", "new", "rising"],
        help="The type of posts to scrape (default: %(default)s)",
    )
    parser.add_argument(
        "-t",
        "--timeframe",
        default="all",
        const="all",
        nargs="?",
        choices=["hour", "day", "week", "month", "year", "all"],
        help="The timeframe from which to scrape posts (default: %(default)s)",
    )
    parser.add_argument(
        "--json",
        help="Write all found posts to a json file.",
        action="store_true",
    )
    parser.add_argument(
        "--csv",
        help="Write all found posts to a csv file.",
        action="store_true",
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="run rpst in debug mode (show network logs)",
        action="store_true",
    )

    return parser
|
||||
|
||||
|
||||
def check_updates(version_tag: str):
    """
    This function checks if there's a new release of a project on GitHub. If there is, it prints
    an upgrade notice followed by the release notes.

    The check is best-effort: a network failure, an HTTP error status (e.g. rate limiting)
    or an unexpected response body is reported on the "rich" logger at debug level and the
    function returns without raising.

    :param version_tag: A string representing the current version of the project.
    """
    # Make a GET request to the GitHub API to get the latest release of the project.
    try:
        response = requests.get(
            "https://api.github.com/repos/bellingcat/reddit-post-scraping-tool/releases/latest",
            timeout=10,
        )
        response.raise_for_status()
        release = response.json()
    except (requests.RequestException, ValueError) as error:
        # ValueError covers a non-JSON body on requests versions whose JSON error
        # is not a RequestException subclass.
        logging.getLogger("rich").debug("Update check skipped: %s", error)
        return

    # Nothing to announce when the tag is missing or matches the running version.
    latest_tag = release.get("tag_name")
    if not latest_tag or latest_tag == version_tag:
        return

    # Tell the user about the new release.
    xprint(
        f"{glyph.up_arrow} A new release of RPST is available ({latest_tag}). "
        f"Run 'pip install --upgrade reddit-post-scraping-tool' to get the updates."
    )

    # Render the Markdown release notes, when the release has any.
    raw_release_notes = release.get("body")
    if raw_release_notes:
        xprint(Markdown(raw_release_notes))
|
||||
|
||||
|
||||
def set_loglevel(args: argparse.Namespace) -> logging.Logger:
    """
    Configures the logging level based on the provided arguments.

    If `args.debug` is True, the logging level is set to "NOTSET".
    Otherwise, the logging level is set to "INFO".

    In both cases a RichHandler is installed with markup enabled, a
    "[%H:%M:%S%p]" time format and the log level column hidden.

    :param args: A namespace object from argparse containing the debugging option (args.debug).
    :return: The logger object named "rich".
    """
    # The two configurations differ only in the level, so pick it up front
    # rather than duplicating the whole basicConfig call.
    level = "NOTSET" if args.debug else "INFO"

    logging.basicConfig(
        level=level,
        format="%(message)s",
        handlers=[
            RichHandler(
                markup=True, log_time_format="[%H:%M:%S%p]", show_level=False
            )
        ],
    )

    return logging.getLogger("rich")
|
||||
|
||||
|
||||
def write_post_data(post_data: dict, filename: str, args, tree_branch: Tree):
    """
    Append one post's data to ``<filename>.json`` and/or ``<filename>.csv`` in the user's
    home directory, depending on ``args.json`` / ``args.csv``, and record on ``tree_branch``
    whether each format was written or skipped.

    :param post_data: A dictionary containing post data.
    :param filename: Base name (without extension) of the output file(s).
    :param args: A namespace object from argparse containing the output format options (args.json or args.csv).
    :param tree_branch: A rich Tree object to which status information will be added.
    """
    output_directory = os.path.expanduser("~")

    # --- JSON: one serialised object per line ---
    if not args.json:
        tree_branch.add(
            f"{glyph.cross_mark_button} JSON data writing operation was skipped. No changes made."
        )
    else:
        json_file_path = os.path.join(output_directory, f"{filename}.json")
        with open(json_file_path, "a", encoding="utf-8") as json_file:
            # The trailing newline separates this post from the next one appended.
            json_file.write(json.dumps(post_data, ensure_ascii=False) + "\n")
        tree_branch.add(
            f"{glyph.page_facing_up} JSON data successfully written/appended to file: "
            f"[italic][link file://{json_file_path}]{json_file_path}[/]"
        )

    # --- CSV: one row per post, columns taken from the dictionary's keys ---
    if not args.csv:
        tree_branch.add(
            f"{glyph.cross_mark_button} CSV data writing operation was skipped. No changes made."
        )
    else:
        csv_file_path = os.path.join(output_directory, f"{filename}.csv")
        with open(csv_file_path, "a", newline="", encoding="utf-8") as csv_file:
            row_writer = csv.DictWriter(csv_file, fieldnames=list(post_data))

            # A position of 0 right after opening means the file is still empty,
            # so the header row has to be written first.
            if not csv_file.tell():
                row_writer.writeheader()

            row_writer.writerow(post_data)
        tree_branch.add(
            f"{glyph.page_facing_up} CSV data successfully written/appended to file: "
            f"[italic][link file://{csv_file_path}]{csv_file_path}[/]"
        )
|
||||
Reference in New Issue
Block a user