From 002dd57c0d430c10390f48e2a881e53a1b347197 Mon Sep 17 00:00:00 2001 From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com> Date: Sun, 6 Aug 2023 05:34:13 +0200 Subject: [PATCH] Update __rpst_.py Added json logger for found posts --- rpst/__rpst_.py | 66 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/rpst/__rpst_.py b/rpst/__rpst_.py index 5e2bef1..8040107 100644 --- a/rpst/__rpst_.py +++ b/rpst/__rpst_.py @@ -1,3 +1,4 @@ +import json import logging import argparse import requests @@ -7,9 +8,24 @@ from rich.markdown import Markdown from rich.logging import RichHandler +def write_post_data(post_data: dict, filename: str): + """ + Writes post data to a specified JSON file. + + :param post_data: A dictionary containing post data. + :param filename: The name of the file to which post data will be written. + """ + # Write the data to a JSON file + with open(filename + ".json", 'a') as file: + file.write(json.dumps(post_data)) + file.write('\n') # write a newline to separate posts + + log.info(f"Post data written to '{file.name}'") + + def check_updates(version_tag: str): """ - Checks if there's a new release of a project on GitHub. If there is, it logs an + This function checks if there's a new release of a project on GitHub. If there is, it logs an information message and prints the release notes. :param version_tag: A string representing the current version of the project. @@ -35,12 +51,14 @@ def check_updates(version_tag: str): xprint(markdown_release_notes) -def get_post_data(post: dict): +def format_post_data(post: dict, keyword: str, output: bool): """ - Extracts relevant data from a Reddit post and displays it in a tree structure, + This function extracts relevant data from a Reddit post and displays it in a tree structure, followed by the post's selftext. :param post: A dictionary containing the data of a Reddit post. 
+ :param keyword: The keyword that is used to find posts, in this case it is used as the filename. + :param output: If specified, all found posts will be written to a json file. """ # Define the data to extract from the post post_data = { @@ -64,7 +82,8 @@ def get_post_data(post: dict): 'Approved at': post['data']['approved_at_utc'], 'Approved by': post['data']['approved_by'], } - + if output: + write_post_data(filename=keyword, post_data=post_data) # Create a tree structure with the post's title as the root post_tree = Tree("\n" + post['data']['title']) @@ -79,19 +98,31 @@ def get_post_data(post: dict): print(post['data']['selftext'] + "\n") -def start_scraper(keyword: str, subreddit: str, listing: str, timeframe: str, limit: int): +def get_posts(arguments: argparse): """ Scrapes a given subreddit for posts that contain a specified keyword. - The search is limited by the number of posts and timeframe specified. + The search is limited by the number of posts and timeframe specified. The results are either + printed to the console or saved to a specified file, based on the 'json' argument. - :param keyword: The keyword to search for in the posts. - :param subreddit: The subreddit to scrape. - :param listing: The type of posts to scrape. This could be 'hot', 'new', etc. - :param timeframe: The timeframe from which to scrape posts. This could be 'day', 'week', etc. - :param limit: The maximum number of posts to scrape. + :param arguments: Namespace object from argparse. - This function logs the number of posts in which the keyword was found. + Expected Object Attributes + -------------------------- + - keyword: The keyword to search for in the posts. + - subreddit: The subreddit to scrape. + - listing: The type of posts to scrape. This could be 'hot', 'new', etc. + - timeframe: The timeframe from which to scrape posts. This could be 'day', 'week', etc. + - limit: The maximum number of posts to scrape. 
+ - json: If specified, all found posts will be written to a json file. + + Also logs the number of posts in which the keyword was found. """ + keyword = arguments.keyword + subreddit = arguments.subreddit + listing = arguments.listing + timeframe = arguments.timeframe + limit = arguments.limit + json_output = arguments.json # Start a new session session = requests.session() @@ -101,7 +132,8 @@ def start_scraper(keyword: str, subreddit: str, listing: str, timeframe: str, li # Send a GET request to the specified subreddit and listing, # limiting the response by the specified limit and timeframe - response = session.get(f'https://reddit.com/r/{subreddit}/{listing}.json?limit={limit}&t={timeframe}').json() + response = session.get(f'https://reddit.com/r/{subreddit}/{listing}' + f'.json?limit={limit}&t={timeframe}').json() # Initialize a counter for the number of posts found that contain the keyword found_posts = 0 @@ -111,7 +143,7 @@ def start_scraper(keyword: str, subreddit: str, listing: str, timeframe: str, li # If the keyword is found in the post's selftext or title, increment the counter and process the post if keyword.lower() in post['data']['selftext'] or keyword.lower() in post['data']['title']: found_posts += 1 - get_post_data(post=post) + format_post_data(post=post, keyword=keyword, output=json_output) # Log the number of posts in which the keyword was found log.info(f"Keyword ('{keyword}') was found in {found_posts}/{len(response['data']['children'])} " @@ -136,7 +168,6 @@ def create_parser(): '-c', '--limit', help='The maximum number of posts to scrape (1-100). (default: %(default)s)', default=10, - const=10, type=int, choices=range(1, 101) # This enforces that the limit must be between 1 and 100 inclusive. 
) @@ -156,6 +187,11 @@ def create_parser(): choices=['hour', 'day', 'week', 'month', 'year', 'all'], help='The timeframe from which to scrape posts (default: %(default)s)' ) + parser.add_argument( + '-j', '--json', + help='Write all found posts to a json file.', + action='store_true' + ) return parser