From d26699cc1f17172fe6b622d782f6cc569f62f549 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Sat, 12 Aug 2023 05:24:35 +0200
Subject: [PATCH 01/15] Update README.md
---
README.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index c02ddb0..fefcda6 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
# RPST (Reddit Post Scraping Tool)
Given a subreddit name and a keyword, RPST will return all posts from a specified listing (default is 'top') that contain the provided keyword.
-[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
+[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  


***
@@ -14,7 +14,7 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
## CLI
- [x] Saves results to a JSON (-j/--json)
-- [x] Automatically checks for new updates. Notifies user if update were found.
+- [x] Automatically checks for new updates. Notifies user if updates were found.
# 📃 TODO
## GUI
@@ -23,7 +23,7 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
- [ ] Make it save results to a CSV file
# 📖 Wiki
-[Refer to the Wiki](https://github.com/rly0nheart/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
+[Refer to the Wiki](https://github.com/bellingcat/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
# 😁 Donations
If you like `RPST` and would like to show support, you can Buy A Coffee for the developer using the button below
From 418b2acc4cbee73f03b34e9a7cd514ea7750bb35 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Sat, 12 Aug 2023 05:25:07 +0200
Subject: [PATCH 02/15] Update README.md
---
RPST GUI/RPST/README.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/RPST GUI/RPST/README.md b/RPST GUI/RPST/README.md
index c2d5d86..b36ded7 100644
--- a/RPST GUI/RPST/README.md
+++ b/RPST GUI/RPST/README.md
@@ -1,7 +1,7 @@
# RPST (Reddit Post Scraping Tool)
Given a subreddit name and a keyword, RPST will return all posts from a specified listing (default is 'top') that contain the provided keyword.
-[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
+[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  


***
@@ -14,7 +14,7 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
## CLI
- [x] Saves results to a JSON (-j/--json)
-- [x] Automatically checks for new updates. Notifies user if update were found.
+- [x] Automatically checks for new updates. Notifies user if updates were found.
# 📃 TODO
## GUI
@@ -23,7 +23,7 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
- [ ] Make it save results to a CSV file
# 📖 Wiki
-[Refer to the Wiki](https://github.com/rly0nheart/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
+[Refer to the Wiki](https://github.com/bellingcat/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
# 😁 Donations
If you like `RPST` and would like to show support, you can Buy A Coffee for the developer using the button below
From cce254e9767d8957083054a9f39be080d18f5d47 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Mon, 14 Aug 2023 02:51:49 +0200
Subject: [PATCH 03/15] Update pyproject.toml
---
pyproject.toml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index cdc299d..08f4aec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ classifiers = [
"Development Status :: 5 - Production/Stable",
"Programming Language :: Python :: 3",
"Programming Language :: Visual Basic",
- "Intended Audience :: Information Technology",
+ "Intended Audience :: End Users/Desktop",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Natural Language :: English"
From 3a9a87e67c467be14aabc17676ee032565136a80 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 14:51:16 +0200
Subject: [PATCH 04/15] Update and rename __rpst.py to rpst.py
1.7.0.0
---
rpst/__rpst.py | 263 -------------------------------------------------
rpst/rpst.py | 131 ++++++++++++++++++++++++
2 files changed, 131 insertions(+), 263 deletions(-)
delete mode 100644 rpst/__rpst.py
create mode 100644 rpst/rpst.py
diff --git a/rpst/__rpst.py b/rpst/__rpst.py
deleted file mode 100644
index 15daae7..0000000
--- a/rpst/__rpst.py
+++ /dev/null
@@ -1,263 +0,0 @@
-import os
-import json
-import logging
-import argparse
-from datetime import datetime
-
-import requests
-from rich.tree import Tree
-from rich import print as xprint
-from rich.markdown import Markdown
-from rich.logging import RichHandler
-
-
-def convert_timestamp_to_datetime(timestamp: int) -> str:
- """
- Converts a Unix timestamp to a formatted datetime string.
-
- :param timestamp: The Unix timestamp to be converted.
- :return: A formatted datetime string in the format "dd MMMM yyyy, hh:mm:ssAM/PM".
- """
- utc_from_timestamp = datetime.utcfromtimestamp(timestamp)
- datetime_object = utc_from_timestamp.strftime("%d %B %Y, %I:%M:%S%p")
- return datetime_object
-
-
-def write_post_data(post_data: dict, filename: str) -> str:
- """
- Writes post data to a specified JSON file.
-
- :param post_data: A dictionary containing post data.
- :param filename: The name of the file to which post data will be written.
- :returns: A string representation of the file path.
- """
- home_directory = os.path.expanduser("~")
- file_path = os.path.join(home_directory, f"{filename}.json")
-
- # Write the data to a JSON file
- with open(file_path, "a") as file:
- file.write(json.dumps(post_data))
- file.write("\n") # write a newline to separate posts.
-
- return file.name
-
-
-def check_updates(version_tag: str):
- """
- This function checks if there's a new release of a project on GitHub. If there is, it logs an
- information message and prints the release notes.
-
- :param version_tag: A string representing the current version of the project.
- """
-
- # Make a GET request to the GitHub API to get the latest release of the project.
- response = requests.get(
- "https://api.github.com/repos/bellingcat/reddit-post-scraping-tool/releases/latest"
- ).json()
-
- # Check if the latest release's tag matches the current version tag.
- if response["tag_name"] != version_tag:
- # If not, convert the release notes from Markdown to HTML.
- raw_release_notes = response["body"]
- markdown_release_notes = Markdown(raw_release_notes)
-
- # Log an info message about the new release.
- log.info(
- f"A new release of RPST is available ({response['tag_name']}). "
- f"Run 'pip install --upgrade reddit-post-scraping-tool' to get the updates."
- )
-
- # Print the release notes.
- xprint(markdown_release_notes)
-
-
-def create_post_branch(post: dict, keyword: str, output: bool, tree: Tree) -> Tree:
- """
- This function extracts relevant data from a Reddit post and adds it in a tree branch structure,
- followed by the post's selftext.
-
- :param post: A dictionary containing the data of a Reddit post.
- :param keyword: The keyword that is used to find posts, in his case gets uses as the filename.
- :param output: If specified, all found posts will be written to a json file.
- :param tree: Tree where the post branch will be added.
- :returns: The main tree with added post branches.
- """
- # Define the data to extract from the post.
- post_data = {
- # "Author": post["data"]["author"],
- "ID": post["data"]["id"],
- "Subreddit": post["data"]["subreddit_name_prefixed"],
- "Visibility": post["data"]["subreddit_type"],
- "Thumbnail": post["data"]["thumbnail"],
- "Gilded": post["data"]["gilded"],
- "Upvotes": post["data"]["ups"],
- "Upvote ratio": post["data"]["upvote_ratio"],
- "Downvotes": post["data"]["downs"],
- "Awards": post["data"]["total_awards_received"],
- "Top award": post["data"]["top_awarded_type"],
- "Is NSFW?": post["data"]["over_18"],
- "Is crosspostable?": post["data"]["is_crosspostable"],
- "Score": post["data"]["score"],
- "Category": post["data"]["category"],
- "Domain": post["data"]["domain"],
- "Posted on": convert_timestamp_to_datetime(post["data"]["created"]),
- "Approved at": post["data"]["approved_at_utc"],
- "Approved by": post["data"]["approved_by"],
- }
-
- # Add the post's branch to the main tree.
- post_branch = tree.add(f":scroll: {post['data']['title']}")
-
- # Add each piece of extracted data as a branch of the post_branch.
- for post_key, post_value in post_data.items():
- post_branch.add(f"{post_key}: {post_value}", style="dim")
-
- # If -j/--json is passed, write found posts to a json file.
- if output:
- # This ensures that the post's selftext is also added to the written json file.
- post_data["Text"] = post["data"]["selftext"]
- output_file = write_post_data(filename=keyword, post_data=post_data)
- tree.add(
- f":page_facing_up: Post data written/appended to "
- f"[italic][link file://{output_file}]{output_file}[/]"
- )
- post_branch.add(post["data"]["selftext"], style="italic")
-
- return tree
-
-
-def get_posts(arguments: argparse):
- """
- Scrapes a given subreddit for posts that contain a specified keyword.
- The search is limited by the number of posts and timeframe specified.
-
- :param arguments: Namespace object from argparse.
-
- Expected Object Attributes
- --------------------------
- - keyword: The keyword to search for in the posts.
- - subreddit: The subreddit to scrape.
- - listing: The type of posts to scrape. This could be 'hot', 'new', etc.
- - timeframe: The timeframe from which to scrape posts. This could be 'day', 'week', etc.
- - limit: The maximum number of posts to scrape.
- - json: If specified, all found posts will be written to a json file.
- """
- keyword = arguments.keyword
- subreddit = arguments.subreddit
- listing = arguments.listing
- timeframe = arguments.timeframe
- limit = arguments.limit
- json_output = arguments.json
-
- # Create main result tree.
- main_tree = Tree(f"[bold]{datetime.now()}[/]", guide_style="bold bright_blue")
-
- # Start a new session
- session = requests.session()
- # Set the User-Agent to mimic a Safari browser on a Mac.
- session.headers = {
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, "
- "like Gecko) Version/14.1.1 Safari/605.1.15"
- }
-
- # Send a GET request to the specified subreddit and listing,
- # limiting the response by the specified limit and timeframe.
- response = session.get(
- f"https://reddit.com/r/{subreddit}/{listing}"
- f".json?limit={limit}&t={timeframe}"
- ).json()
-
- # Initialize a counter for the number of posts found that contain the keyword.
- found_posts = 0
-
- # Loop through each post in the response
- for post_index, post in enumerate(response["data"]["children"], start=1):
- # If the keyword is found in the post's selftext or title, increment the counter and process the post.
- if (
- keyword.lower() in post["data"]["selftext"]
- or keyword.lower() in post["data"]["title"]
- ):
- # Create a branch for found post(s) and show post index and post author as the title
- found_tree = main_tree.add(
- f":bust_in_silhouette: #{post_index} by [bold]@{post['data']['author']}[/]"
- )
- found_posts += 1
- create_post_branch(
- post=post,
- keyword=keyword,
- output=json_output,
- tree=found_tree,
- )
-
- # Log the number of posts in which the keyword was found
- main_tree.add(
- f"Keyword ('{keyword}') was found in {found_posts}/{len(response['data']['children'])} "
- f"{listing} posts from r/{subreddit}."
- )
- xprint(main_tree)
-
-
-def create_parser():
- """
- Creates and configures an argument parser for the command line arguments.
-
- :return: A configured argparse.ArgumentParser object ready to parse the command line arguments.
- """
- parser = argparse.ArgumentParser(
- description="RPST (Reddit Post Scraping Tool) —by Richard Mwewa | https://about.me/rly0nheart",
- epilog="Given a subreddit name and a keyword, "
- "RPST returns all top (by default) posts that contain the specified keyword.",
- )
-
- parser.add_argument(
- "-k", "--keyword", help="The keyword to search for in the posts.", required=True
- )
- parser.add_argument(
- "-s", "--subreddit", help="The subreddit to scrape.", required=True
- )
- parser.add_argument(
- "-c",
- "--limit",
- help="The maximum number of posts to scrape (1-100). (default: %(default)s)",
- default=10,
- type=int,
- choices=range(
- 1, 101
- ), # This enforces that the limit must be between 1 and 100 inclusive.
- )
- parser.add_argument(
- "-l",
- "--listing",
- default="top",
- const="top",
- nargs="?",
- choices=["controversial", "hot", "best", "new", "rising"],
- help="The type of posts to scrape (default: %(default)s)",
- )
- parser.add_argument(
- "-t",
- "--timeframe",
- default="all",
- const="all",
- nargs="?",
- choices=["hour", "day", "week", "month", "year", "all"],
- help="The timeframe from which to scrape posts (default: %(default)s)",
- )
- parser.add_argument(
- "-j",
- "--json",
- help="Write all found posts to a json file.",
- action="store_true",
- )
-
- return parser
-
-
-logging.basicConfig(
- level="NOTSET",
- format="%(message)s",
- handlers=[
- RichHandler(markup=True, log_time_format="[%H:%M:%S%p]", show_level=False)
- ],
-)
-log = logging.getLogger("rich")
diff --git a/rpst/rpst.py b/rpst/rpst.py
new file mode 100644
index 0000000..18b03a7
--- /dev/null
+++ b/rpst/rpst.py
@@ -0,0 +1,131 @@
+import argparse
+from datetime import datetime
+
+import requests
+from glyphoji import glyph
+from rich.tree import Tree
+from rich import print as xprint
+
+from .utils import convert_timestamp_to_datetime, write_post_data
+
+
+def create_post_branch(post: dict, keyword: str, tree: Tree, args: argparse) -> Tree:
+ """
+ Extracts relevant data from a Reddit post and adds it to the given tree as a post branch,
+ followed by the post's selftext.
+
+ :param post: A dictionary containing the data of a Reddit post.
+ :param keyword: The keyword used to find posts; it is also passed to write_post_data as the filename.
+ :param tree: Tree where the post branch will be added.
+ :param args: A namespace object from argparse; passed through to write_post_data.
+ :returns: The tree that was passed in, with the post branch added.
+ """
+ # Define the data to extract from the post.
+ post_data = {
+ # "Author": post["data"]["author"],
+ f"{glyph.id_button} ID": post["data"]["id"],
+ f"{glyph.people_hugging} Subreddit": post["data"]["subreddit_name_prefixed"],
+ f"{glyph.face_with_peeking_eye} Visibility": post["data"]["subreddit_type"],
+ f"{glyph.framed_picture} Thumbnail": post["data"]["thumbnail"],
+ f"{glyph.white_question_mark} Gilded": post["data"]["gilded"],
+ f"{glyph.up_arrow} Upvotes": post["data"]["ups"],
+ f"{glyph.chart_increasing} Upvote ratio": post["data"]["upvote_ratio"],
+ f"{glyph.down_arrow} Downvotes": post["data"]["downs"],
+ f"{glyph.trophy} Awards": post["data"]["total_awards_received"],
+ f"{glyph.trophy} Top award": post["data"]["top_awarded_type"],
+ f"{glyph.no_one_under_eighteen} Is NSFW?": post["data"]["over_18"],
+ f"{glyph.left_arrow_curving_right} Is crosspostable?": post["data"][
+ "is_crosspostable"
+ ],
+ f"{glyph.bar_chart} Score": post["data"]["score"],
+ f"{glyph.card_file_box} Category": post["data"]["category"],
+ f"{glyph.globe_with_meridians} Domain": post["data"]["domain"],
+ f"{glyph.calendar} Posted on": convert_timestamp_to_datetime(
+ post["data"]["created"]
+ ),
+ f"{glyph.calendar} Approved at": post["data"]["approved_at_utc"],
+ f"{glyph.bust_in_silhouette} Approved by": post["data"]["approved_by"],
+ }
+
+ # Add the post's branch to the tree that was passed in.
+ post_branch = tree.add(f"{glyph.scroll} {post['data']['title']}")
+
+ # Add each piece of extracted data as a dimmed child of the post_branch.
+ for post_key, post_value in post_data.items():
+ post_branch.add(f"{post_key}: {post_value}", style="dim")
+
+ # Added after the loop above so the selftext goes to the json/csv output but is not shown as a dim entry.
+ post_data[f"{glyph.clipboard} Text"] = post["data"]["selftext"]
+ write_post_data(
+ filename=keyword, post_data=post_data, tree_branch=post_branch, args=args
+ )
+ post_branch.add(post["data"]["selftext"], style="italic")
+
+ return tree
+
+
+def get_posts(args: argparse):
+ """
+ Scrapes a given subreddit for posts that contain a specified keyword (matched case-insensitively).
+ The search is limited by the number of posts and timeframe specified.
+
+ :param args: Namespace object from argparse.
+
+ Expected Object Attributes
+ --------------------------
+ - keyword: The keyword to search for in the posts.
+ - subreddit: The subreddit to scrape.
+ - listing: The type of posts to scrape. This could be 'hot', 'new', etc.
+ - timeframe: The timeframe from which to scrape posts. This could be 'day', 'week', etc.
+ - limit: The maximum number of posts to scrape.
+ - json / csv: If set, found posts are written to a JSON / CSV file (handled by write_post_data).
+ """
+ keyword = args.keyword
+ subreddit = args.subreddit
+ listing = args.listing
+ timeframe = args.timeframe
+ limit = args.limit
+
+ # Create main result tree.
+ main_tree = Tree(
+ f"[bold]{glyph.calendar} {datetime.now()}[/]", guide_style="bold bright_blue"
+ )
+
+ # Start a new session
+ session = requests.session()
+ # Set the User-Agent to mimic a Safari browser on a Mac.
+ session.headers = {
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, "
+ "like Gecko) Version/14.1.1 Safari/605.1.15"
+ }
+
+ # Send a GET request to the specified subreddit and listing,
+ # limiting the response by the specified limit and timeframe.
+ response = session.get(
+ f"https://reddit.com/r/{subreddit}/{listing}"
+ f".json?limit={limit}&t={timeframe}"
+ ).json()
+
+ # Initialize a counter for the number of posts found that contain the keyword.
+ found_posts = 0
+
+ # Loop through each post in the response
+ for post_index, post in enumerate(response["data"]["children"], start=1):
+ # Both sides are lower-cased so the match is case-insensitive; on a hit, count and process the post.
+ if (
+ keyword.lower() in post["data"]["selftext"].lower()
+ or keyword.lower() in post["data"]["title"].lower()
+ ):
+ # Create a branch for found post(s) and show post index and post author as the title
+ found_tree = main_tree.add(
+ f"{glyph.bust_in_silhouette} #{post_index} by [bold]@{post['data']['author']}[/]"
+ )
+ found_posts += 1
+ create_post_branch(post=post, keyword=keyword, tree=found_tree, args=args)
+
+ # Log the number of posts in which the keyword was found
+ main_tree.add(
+ f"{glyph.check_mark_button} Keyword ('{keyword}') was found in "
+ f"{found_posts}/{len(response['data']['children'])} {listing} posts from r/{subreddit}."
+ )
+ xprint(main_tree)
From f117c99cc7a05e9edc537c93261d7583c7a82d1e Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 14:52:27 +0200
Subject: [PATCH 05/15] Update and rename __main.py to main.py
1.7.0.0
---
rpst/{__main.py => main.py} | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
rename rpst/{__main.py => main.py} (66%)
diff --git a/rpst/__main.py b/rpst/main.py
similarity index 66%
rename from rpst/__main.py
rename to rpst/main.py
index eba61ae..458a734 100644
--- a/rpst/__main.py
+++ b/rpst/main.py
@@ -1,5 +1,7 @@
from datetime import datetime
-from rpst.__rpst import log, get_posts, check_updates, create_parser
+
+from .rpst import get_posts
+from .utils import create_parser, set_loglevel, check_updates
def run():
@@ -10,20 +12,22 @@ def run():
# Create a parser and parse the command line arguments
parser = create_parser()
- arguments = parser.parse_args()
+ args = parser.parse_args()
+
+ log = set_loglevel(args=args)
# Record the start time
start_time = datetime.now()
try:
# Check for updates
- check_updates(version_tag="1.6.2.0")
+ check_updates(version_tag="1.7.0.0")
# Get posts with the provided/parsed arguments
- get_posts(arguments=arguments)
+ get_posts(args=args)
except KeyboardInterrupt:
log.warning("User interruption detected.")
except Exception as e:
log.error(f"An error occurred: {e}")
finally:
- log.info(f'Finished in {datetime.now() - start_time} seconds.')
+ log.info(f"Finished in {datetime.now() - start_time} seconds.")
From 8f259b7a40440176ce894176911414105cfc19bc Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 14:54:31 +0200
Subject: [PATCH 06/15] Update pyproject.toml
1.7.0.0
---
pyproject.toml | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 08f4aec..2fef997 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,12 +7,12 @@ packages = ["rpst"]
[project]
name = "reddit-post-scraping-tool"
-version = "1.6.2.0"
+version = "1.7.0.0"
description = "Given a subreddit name and a keyword, RPST returns all top (by default) posts that contain the specified keyword."
readme = "README.md"
requires-python = ">=3.8"
license = {file = "LICENSE"}
-keywords = ["osint", "reddit-crawler", "reddit-scraping", "reddit"]
+keywords = ["reddit-crawler", "reddit-scraping", "reddit", "reddit-api"]
authors = [{name = "Richard Mwewa", email = "rly0nheart@duck.com"}]
classifiers = [
"Development Status :: 5 - Production/Stable",
@@ -26,6 +26,7 @@ classifiers = [
dependencies = [
"rich",
+ "glyphoji",
"requests",
]
@@ -35,4 +36,4 @@ documentation = "https://github.com/bellingcat/reddit-post-scraping-tool/wiki"
repository = "https://github.com/bellingcat/reddit-post-scraping-tool.git"
[project.scripts]
-rpst = "rpst.__main:run"
+rpst = "rpst.main:run"
From b08c4a147b6aa55cd07a8134dea97ccd3afe4c58 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 15:04:37 +0200
Subject: [PATCH 07/15] Create utils.py
---
rpst/utils.py | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 202 insertions(+)
create mode 100644 rpst/utils.py
diff --git a/rpst/utils.py b/rpst/utils.py
new file mode 100644
index 0000000..2788b81
--- /dev/null
+++ b/rpst/utils.py
@@ -0,0 +1,202 @@
+import os
+import csv
+import json
+import logging
+import argparse
+from datetime import datetime
+
+import requests
+from glyphoji import glyph
+from rich.tree import Tree
+from rich import print as xprint
+from rich.markdown import Markdown
+from rich.logging import RichHandler
+
+
+def convert_timestamp_to_datetime(timestamp: int) -> str:
+ """
+ Converts a Unix timestamp to a formatted UTC datetime string.
+
+ :param timestamp: The Unix timestamp (seconds since the epoch) to be converted.
+ :return: A formatted datetime string in the format "dd MMMM yyyy, hh:mm:ssAM/PM" (12-hour clock, UTC).
+ """
+ utc_from_timestamp = datetime.utcfromtimestamp(timestamp)
+ datetime_object = utc_from_timestamp.strftime("%d %B %Y, %I:%M:%S%p")
+ return datetime_object
+
+
+def create_parser():
+ """
+ Creates and configures an argument parser for the command line arguments.
+
+ :return: A configured argparse.ArgumentParser object ready to parse the command line arguments.
+ """
+ parser = argparse.ArgumentParser(
+ description="RPST (Reddit Post Scraping Tool) —by Richard Mwewa | https://about.me/rly0nheart",
+ epilog="Given a subreddit name and a keyword, "
+ "RPST returns all top (by default) posts that contain the specified keyword.",
+ )
+
+ parser.add_argument(
+ "-k", "--keyword", help="The keyword to search for in the posts.", required=True
+ )
+ parser.add_argument(
+ "-s", "--subreddit", help="The subreddit to scrape.", required=True
+ )
+ parser.add_argument(
+ "-c",
+ "--limit",
+ help="The maximum number of posts to scrape (1-100). (default: %(default)s)",
+ default=10,
+ type=int,
+ choices=range(
+ 1, 101
+ ), # This enforces that the limit must be between 1 and 100 inclusive.
+ )
+ parser.add_argument(
+ "-l",
+ "--listing",
+ default="top",
+ const="top",
+ nargs="?",
+ choices=["top", "controversial", "hot", "best", "new", "rising"], # "top" is the default/const, so it must be a valid choice.
+ help="The type of posts to scrape (default: %(default)s)",
+ )
+ parser.add_argument(
+ "-t",
+ "--timeframe",
+ default="all",
+ const="all",
+ nargs="?",
+ choices=["hour", "day", "week", "month", "year", "all"],
+ help="The timeframe from which to scrape posts (default: %(default)s)",
+ )
+ parser.add_argument(
+ "--json",
+ help="Write all found posts to a json file.",
+ action="store_true",
+ )
+ parser.add_argument(
+ "--csv",
+ help="Write all found posts to a csv file.",
+ action="store_true",
+ )
+ parser.add_argument(
+ "-d",
+ "--debug",
+ help="run rpst in debug mode (show network logs)",
+ action="store_true",
+ )
+
+ return parser
+
+
+def check_updates(version_tag: str):
+ """
+ This function checks if there's a new release of a project on GitHub. If there is, it prints an
+ upgrade notice followed by the release notes.
+
+ :param version_tag: A string representing the current version of the project.
+ """
+
+ # Make a GET request to the GitHub API to get the latest release of the project.
+ response = requests.get(
+ "https://api.github.com/repos/bellingcat/reddit-post-scraping-tool/releases/latest"
+ ).json()
+
+ # Compare the latest tag with the current one; a payload without "tag_name" is treated as "no update".
+ if response.get("tag_name", version_tag) != version_tag:
+ # The release notes are Markdown; they are rendered by rich when printed below.
+ raw_release_notes = response["body"]
+
+ # Print a notice about the new release.
+ xprint(
+ f"{glyph.up_arrow} A new release of RPST is available ({response['tag_name']}). "
+ f"Run 'pip install --upgrade reddit-post-scraping-tool' to get the updates."
+ )
+
+ # Print the release notes.
+ xprint(Markdown(raw_release_notes))
+
+
+def set_loglevel(args: argparse.Namespace) -> logging.Logger:
+ """
+ Configures the logging level based on the provided arguments.
+
+ If `args.debug` is True, the logging level is set to "NOTSET", allowing all log messages to be displayed.
+ Otherwise, the logging level is set to "INFO", and only informational and higher-severity messages are displayed.
+
+ Both branches configure the same RichHandler for formatting the log messages,
+ with a specific time format and with the log level hidden; only the level differs.
+
+ :param args: A namespace object from argparse containing the debugging option (args.debug).
+ :return: The logger object associated with the name "rich".
+ """
+ if args.debug:
+ logging.basicConfig(
+ level="NOTSET",
+ format="%(message)s",
+ handlers=[
+ RichHandler(
+ markup=True, log_time_format="[%H:%M:%S%p]", show_level=False
+ )
+ ],
+ )
+ else:
+ logging.basicConfig(
+ level="INFO",
+ format="%(message)s",
+ handlers=[
+ RichHandler(
+ markup=True, log_time_format="[%H:%M:%S%p]", show_level=False
+ )
+ ],
+ )
+
+ return logging.getLogger("rich")
+
+
+def write_post_data(post_data: dict, filename: str, args, tree_branch: Tree):
+ """
+ Appends post data to a JSON and/or CSV file in the user's home directory, depending on the args
+ provided, and adds a status line for each format (written or skipped) to the provided tree branch.
+
+ :param post_data: A dictionary containing post data.
+ :param filename: The base name (without extension) of the file to which post data will be written.
+ :param args: A namespace object from argparse containing the output format options (args.json or args.csv).
+ :param tree_branch: A rich Tree object to which status information will be added.
+ """
+ home_directory = os.path.expanduser("~")
+
+ if args.json:
+ json_file_path = os.path.join(home_directory, f"{filename}.json")
+ with open(json_file_path, "a", encoding="utf-8") as file:
+ file.write(json.dumps(post_data, ensure_ascii=False))
+ file.write("\n") # One JSON object per line; the newline separates posts
+ tree_branch.add(
+ f"{glyph.page_facing_up} JSON data successfully written/appended to file: "
+ f"[italic][link file://{json_file_path}]{json_file_path}[/]"
+ )
+ else:
+ tree_branch.add(
+ f"{glyph.cross_mark_button} JSON data writing operation was skipped. No changes made."
+ )
+
+ if args.csv:
+ csv_file_path = os.path.join(home_directory, f"{filename}.csv")
+ with open(csv_file_path, "a", newline="", encoding="utf-8") as csvfile:
+ writer = csv.DictWriter(csvfile, fieldnames=post_data.keys())
+
+ # Write the header row only when the position in the append-mode file is 0 (nothing written yet)
+ if csvfile.tell() == 0:
+ writer.writeheader()
+
+ writer.writerow(post_data)
+ tree_branch.add(
+ f"{glyph.page_facing_up} CSV data successfully written/appended to file: "
+ f"[italic][link file://{csv_file_path}]{csv_file_path}[/]"
+ )
+ else:
+ tree_branch.add(
+ f"{glyph.cross_mark_button} CSV data writing operation was skipped. No changes made."
+ )
From 7c164938c98f98fdade2ab11e8e3e748fb8f6e03 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 15:06:31 +0200
Subject: [PATCH 08/15] Update RPST.vbproj
---
RPST GUI/RPST/RPST.vbproj | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/RPST GUI/RPST/RPST.vbproj b/RPST GUI/RPST/RPST.vbproj
index 1062cc1..bbde351 100644
--- a/RPST GUI/RPST/RPST.vbproj
+++ b/RPST GUI/RPST/RPST.vbproj
@@ -13,11 +13,11 @@
https://github.com/bellingcat/reddit-post-scraping-tool
README.md
https://github.com/bellingcat/reddit-post-scraping-tool
- 1.6.2.0
- 1.6.2.0
+ 1.7.0.0
+ 1.7.0.0
LICENSE
True
- 1.6.2
+ 1.7.0
reddit;scraper;reddit-scraper;osint
6.0-recommended
@@ -39,7 +39,7 @@
-
+
@@ -78,4 +78,4 @@
-
\ No newline at end of file
+
From c3e5ce6441e732a667d30d4ba7a58127ce09f619 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 15:26:42 +0200
Subject: [PATCH 09/15] Update rpst.py
---
rpst/rpst.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/rpst/rpst.py b/rpst/rpst.py
index 18b03a7..7b733a7 100644
--- a/rpst/rpst.py
+++ b/rpst/rpst.py
@@ -27,7 +27,7 @@ def create_post_branch(post: dict, keyword: str, tree: Tree, args: argparse) ->
f"{glyph.people_hugging} Subreddit": post["data"]["subreddit_name_prefixed"],
f"{glyph.face_with_peeking_eye} Visibility": post["data"]["subreddit_type"],
f"{glyph.framed_picture} Thumbnail": post["data"]["thumbnail"],
- f"{glyph.white_question_mark} Gilded": post["data"]["gilded"],
+ f"{glyph.white_question_mark} Gilded": post["data"]["gilded"],
f"{glyph.up_arrow} Upvotes": post["data"]["ups"],
f"{glyph.chart_increasing} Upvote ratio": post["data"]["upvote_ratio"],
f"{glyph.down_arrow} Downvotes": post["data"]["downs"],
@@ -125,7 +125,7 @@ def get_posts(args: argparse):
# Log the number of posts in which the keyword was found
main_tree.add(
- f"{glyph.check_mark_button} Keyword ('{keyword}') was found in "
+ f"{glyph.check_mark_button} Keyword ('{keyword}') was found in "
f"{found_posts}/{len(response['data']['children'])} {listing} posts from r/{subreddit}."
)
xprint(main_tree)
From 566f5587206885acef0cfad8998a4b11090c4a76 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 15:27:18 +0200
Subject: [PATCH 10/15] Update utils.py
---
rpst/utils.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/rpst/utils.py b/rpst/utils.py
index 2788b81..27fec42 100644
--- a/rpst/utils.py
+++ b/rpst/utils.py
@@ -179,7 +179,7 @@ def write_post_data(post_data: dict, filename: str, args, tree_branch: Tree):
)
else:
tree_branch.add(
- f"{glyph.cross_mark_button} JSON data writing operation was skipped. No changes made."
+ f"{glyph.cross_mark_button} JSON data writing operation was skipped. No changes made."
)
if args.csv:
@@ -198,5 +198,5 @@ def write_post_data(post_data: dict, filename: str, args, tree_branch: Tree):
)
else:
tree_branch.add(
- f"{glyph.cross_mark_button} CSV data writing operation was skipped. No changes made."
+ f"{glyph.cross_mark_button} CSV data writing operation was skipped. No changes made."
)
From b5b7df868e237531e02f28ca8c9365e45d4e42c6 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 15:44:13 +0200
Subject: [PATCH 11/15] Update README.md
---
README.md | 25 ++++++++++++++++++-------
1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index fefcda6..fd301b5 100644
--- a/README.md
+++ b/README.md
@@ -2,19 +2,17 @@
Given a subreddit name and a keyword, RPST will return all posts from a specified listing (default is 'top') that contain the provided keyword.
[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
-
-
-***
# ✅ Features
## GUI
-- [x] Dark mode (Right-click)
-- [x] Saves results to a JSON (Right-click)
+- [x] Dark mode (*Right-click*)
+- [x] Saves results to a JSON file (*Right-click*)
- [x] Logs errors to a file
## CLI
-- [x] Saves results to a JSON (-j/--json)
-- [x] Automatically checks for new updates. Notifies user if updates were found.
+- [x] Saves results to JSON (*specify* `--json`)
+- [x] Saves results to CSV (*specify* `--csv`)
+- [x] Automatically checks for new updates, and notifies user if updates were found.
# 📃 TODO
## GUI
@@ -22,6 +20,19 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
- [x] Add manual dark mode option, that will be persistent in all sessions
- [ ] Make it save results to a CSV file
+# Images
+## GUI
+* 
+* 
+
+## CLI
+* 
+* 
+* 
+
+
+
+
# 📖 Wiki
[Refer to the Wiki](https://github.com/bellingcat/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
From b31c38f5ccae948bb4f39961f577d879f900fde0 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 15:48:19 +0200
Subject: [PATCH 12/15] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index fd301b5..3978328 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
## CLI
* 
-* 
+* 
* 
From b0a8d75d8c2d315c8d2259b9938f1a850df56749 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 15:49:07 +0200
Subject: [PATCH 13/15] Update README.md
---
RPST GUI/RPST/README.md | 25 ++++++++++++++++++-------
1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/RPST GUI/RPST/README.md b/RPST GUI/RPST/README.md
index b36ded7..72e375b 100644
--- a/RPST GUI/RPST/README.md
+++ b/RPST GUI/RPST/README.md
@@ -2,19 +2,17 @@
Given a subreddit name and a keyword, RPST will return all posts from a specified listing (default is 'top') that contain the provided keyword.
[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
-
-
-***
# ✅ Features
## GUI
-- [x] Dark mode (Right-click)
-- [x] Saves results to a JSON (Right-click)
+- [x] Dark mode (*Right-click*)
+- [x] Saves results to a JSON file (*Right-click*)
- [x] Logs errors to a file
## CLI
-- [x] Saves results to a JSON (-j/--json)
-- [x] Automatically checks for new updates. Notifies user if updates were found.
+- [x] Saves results to JSON (*specify* `--json`)
+- [x] Saves results to CSV (*specify* `--csv`)
+- [x] Automatically checks for new updates, and notifies user if updates were found.
# 📃 TODO
## GUI
@@ -22,6 +20,19 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
- [x] Add manual dark mode option, that will be persistent in all sessions
- [ ] Make it save results to a CSV file
+# Images
+## GUI
+* 
+* 
+
+## CLI
+* 
+* 
+* 
+
+
+
+
# 📖 Wiki
[Refer to the Wiki](https://github.com/bellingcat/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
From 2a2696403db98f0f6808eba4772a34abd92f4e83 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 15:52:49 +0200
Subject: [PATCH 14/15] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 3978328..8183c13 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
- [x] Add manual dark mode option, that will be persistent in all sessions
- [ ] Make it save results to a CSV file
-# Images
+# Images & Screenshots
## GUI
* 
* 
From cfef86cbe3851d34691f303712af27192c1e2af0 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 25 Aug 2023 15:53:23 +0200
Subject: [PATCH 15/15] Update README.md
---
RPST GUI/RPST/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/RPST GUI/RPST/README.md b/RPST GUI/RPST/README.md
index 72e375b..52ea2a6 100644
--- a/RPST GUI/RPST/README.md
+++ b/RPST GUI/RPST/README.md
@@ -20,7 +20,7 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
- [x] Add manual dark mode option, that will be persistent in all sessions
- [ ] Make it save results to a CSV file
-# Images
+# Images & Screenshots
## GUI
* 
* 