diff --git a/RPST GUI/RPST/RPST.vbproj b/RPST GUI/RPST/RPST.vbproj index 379db43..1062cc1 100644 --- a/RPST GUI/RPST/RPST.vbproj +++ b/RPST GUI/RPST/RPST.vbproj @@ -13,11 +13,11 @@ https://github.com/bellingcat/reddit-post-scraping-tool README.md https://github.com/bellingcat/reddit-post-scraping-tool - 1.6.1.0 - 1.6.1.0 + 1.6.2.0 + 1.6.2.0 LICENSE True - 1.6.1 + 1.6.2 reddit;scraper;reddit-scraper;osint 6.0-recommended diff --git a/pyproject.toml b/pyproject.toml index dc85dcc..cdc299d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ packages = ["rpst"] [project] name = "reddit-post-scraping-tool" -version = "1.6.1.0" +version = "1.6.2.0" description = "Given a subreddit name and a keyword, RPST returns all top (by default) posts that contain the specified keyword." readme = "README.md" requires-python = ">=3.8" diff --git a/rpst/__main.py b/rpst/__main.py index 7edee3e..eba61ae 100644 --- a/rpst/__main.py +++ b/rpst/__main.py @@ -17,7 +17,7 @@ def run(): try: # Check for updates - check_updates(version_tag="1.6.1.0") + check_updates(version_tag="1.6.2.0") # Get posts with the provided/parsed arguments get_posts(arguments=arguments) diff --git a/rpst/__rpst.py b/rpst/__rpst.py index c6a0092..15daae7 100644 --- a/rpst/__rpst.py +++ b/rpst/__rpst.py @@ -37,7 +37,7 @@ def write_post_data(post_data: dict, filename: str) -> str: # Write the data to a JSON file with open(file_path, "a") as file: file.write(json.dumps(post_data)) - file.write("\n") # write a newline to separate posts + file.write("\n") # write a newline to separate posts. return file.name @@ -50,24 +50,24 @@ def check_updates(version_tag: str): :param version_tag: A string representing the current version of the project. """ - # Make a GET request to the GitHub API to get the latest release of the project + # Make a GET request to the GitHub API to get the latest release of the project. response = requests.get( "https://api.github.com/repos/bellingcat/reddit-post-scraping-tool/releases/latest" ).json() - # Check if the latest release's tag matches the current version tag + # Check if the latest release's tag matches the current version tag. if response["tag_name"] != version_tag: - # If not, convert the release notes from Markdown to HTML + # If not, convert the release notes from Markdown to HTML. raw_release_notes = response["body"] markdown_release_notes = Markdown(raw_release_notes) - # Log an info message about the new release + # Log an info message about the new release. log.info( f"A new release of RPST is available ({response['tag_name']}). " f"Run 'pip install --upgrade reddit-post-scraping-tool' to get the updates." ) - # Print the release notes + # Print the release notes. xprint(markdown_release_notes) @@ -82,20 +82,20 @@ def create_post_branch(post: dict, keyword: str, output: bool, tree: Tree) -> Tr :param tree: Tree where the post branch will be added. :returns: The main tree with added post branches. """ - # Define the data to extract from the post + # Define the data to extract from the post. post_data = { - # 'Author': post['data']['author'], + # "Author": post["data"]["author"], "ID": post["data"]["id"], "Subreddit": post["data"]["subreddit_name_prefixed"], "Visibility": post["data"]["subreddit_type"], "Thumbnail": post["data"]["thumbnail"], - "NSFW": post["data"]["over_18"], "Gilded": post["data"]["gilded"], "Upvotes": post["data"]["ups"], "Upvote ratio": post["data"]["upvote_ratio"], "Downvotes": post["data"]["downs"], "Awards": post["data"]["total_awards_received"], "Top award": post["data"]["top_awarded_type"], + "Is NSFW?": post["data"]["over_18"], "Is crosspostable?": post["data"]["is_crosspostable"], "Score": post["data"]["score"], "Category": post["data"]["category"], @@ -104,19 +104,23 @@ def create_post_branch(post: dict, keyword: str, output: bool, tree: Tree) -> Tr "Approved at": post["data"]["approved_at_utc"], "Approved by": post["data"]["approved_by"], } + + # Add the post's branch to the main tree. + post_branch = tree.add(f":scroll: {post['data']['title']}") + + # Add each piece of extracted data as a branch of the post_branch. + for post_key, post_value in post_data.items(): + post_branch.add(f"{post_key}: {post_value}", style="dim") + + # If -j/--json is passed, write found posts to a json file. if output: + # This ensures that the post's selftext is also added to the written json file. + post_data["Text"] = post["data"]["selftext"] output_file = write_post_data(filename=keyword, post_data=post_data) tree.add( f":page_facing_up: Post data written/appended to " f"[italic][link file://{output_file}]{output_file}[/]" ) - - # Add the post's branch to the main tree. - post_branch = tree.add(f":scroll: {post['data']['title']}") - - # Add each piece of extracted data as a branch of the post_branch - for post_key, post_value in post_data.items(): - post_branch.add(f"{post_key}: {post_value}", style="dim") post_branch.add(post["data"]["selftext"], style="italic") return tree @@ -150,25 +154,25 @@ def get_posts(arguments: argparse): # Start a new session session = requests.session() - # Set the User-Agent to mimic a Safari browser on a Mac + # Set the User-Agent to mimic a Safari browser on a Mac. session.headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, " "like Gecko) Version/14.1.1 Safari/605.1.15" } # Send a GET request to the specified subreddit and listing, - # limiting the response by the specified limit and timeframe + # limiting the response by the specified limit and timeframe. response = session.get( f"https://reddit.com/r/{subreddit}/{listing}" f".json?limit={limit}&t={timeframe}" ).json() - # Initialize a counter for the number of posts found that contain the keyword + # Initialize a counter for the number of posts found that contain the keyword. found_posts = 0 # Loop through each post in the response for post_index, post in enumerate(response["data"]["children"], start=1): - # If the keyword is found in the post's selftext or title, increment the counter and process the post + # If the keyword is found in the post's selftext or title, increment the counter and process the post. if ( keyword.lower() in post["data"]["selftext"] or keyword.lower() in post["data"]["title"]