Update and rename __rpst.py to rpst.py

1.7.0.0
2026-06-11 21:18:29 +03:00 · 2023-08-25 14:51:16 +02:00
parent cce254e976
commit 3a9a87e67c
2 changed files with 131 additions and 263 deletions
--- a/rpst/__rpst.py
+++ b/rpst/__rpst.py
@@ -1,263 +0,0 @@
-import os
-import json
-import logging
-import argparse
-from datetime import datetime
-
-import requests
-from rich.tree import Tree
-from rich import print as xprint
-from rich.markdown import Markdown
-from rich.logging import RichHandler
-
-
-def convert_timestamp_to_datetime(timestamp: int) -> str:
-    """
-    Converts a Unix timestamp to a formatted datetime string.
-
-    :param timestamp: The Unix timestamp to be converted.
-    :return: A formatted datetime string in the format "dd MMMM yyyy, hh:mm:ssAM/PM".
-    """
-    utc_from_timestamp = datetime.utcfromtimestamp(timestamp)
-    datetime_object = utc_from_timestamp.strftime("%d %B %Y, %I:%M:%S%p")
-    return datetime_object
-
-
-def write_post_data(post_data: dict, filename: str) -> str:
-    """
-    Writes post data to a specified JSON file.
-
-    :param post_data: A dictionary containing post data.
-    :param filename: The name of the file to which post data will be written.
-    :returns: A string representation of the file path.
-    """
-    home_directory = os.path.expanduser("~")
-    file_path = os.path.join(home_directory, f"{filename}.json")
-
-    # Write the data to a JSON file
-    with open(file_path, "a") as file:
-        file.write(json.dumps(post_data))
-        file.write("\n")  # write a newline to separate posts.
-
-    return file.name
-
-
-def check_updates(version_tag: str):
-    """
-    This function checks if there's a new release of a project on GitHub. If there is, it logs an
-    information message and prints the release notes.
-
-    :param version_tag: A string representing the current version of the project.
-    """
-
-    # Make a GET request to the GitHub API to get the latest release of the project.
-    response = requests.get(
-        "https://api.github.com/repos/bellingcat/reddit-post-scraping-tool/releases/latest"
-    ).json()
-
-    # Check if the latest release's tag matches the current version tag.
-    if response["tag_name"] != version_tag:
-        # If not, convert the release notes from Markdown to HTML.
-        raw_release_notes = response["body"]
-        markdown_release_notes = Markdown(raw_release_notes)
-
-        # Log an info message about the new release.
-        log.info(
-            f"A new release of RPST is available ({response['tag_name']}). "
-            f"Run 'pip install --upgrade reddit-post-scraping-tool' to get the updates."
-        )
-
-        # Print the release notes.
-        xprint(markdown_release_notes)
-
-
-def create_post_branch(post: dict, keyword: str, output: bool, tree: Tree) -> Tree:
-    """
-    This function extracts relevant data from a Reddit post and adds it in a tree branch structure,
-    followed by the post's selftext.
-
-    :param post: A dictionary containing the data of a Reddit post.
-    :param keyword: The keyword that is used to find posts, in his case gets uses as the filename.
-    :param output: If specified, all found posts will be written to a json file.
-    :param tree: Tree where the post branch will be added.
-    :returns: The main tree with added post branches.
-    """
-    # Define the data to extract from the post.
-    post_data = {
-        # "Author": post["data"]["author"],
-        "ID": post["data"]["id"],
-        "Subreddit": post["data"]["subreddit_name_prefixed"],
-        "Visibility": post["data"]["subreddit_type"],
-        "Thumbnail": post["data"]["thumbnail"],
-        "Gilded": post["data"]["gilded"],
-        "Upvotes": post["data"]["ups"],
-        "Upvote ratio": post["data"]["upvote_ratio"],
-        "Downvotes": post["data"]["downs"],
-        "Awards": post["data"]["total_awards_received"],
-        "Top award": post["data"]["top_awarded_type"],
-        "Is NSFW?": post["data"]["over_18"],
-        "Is crosspostable?": post["data"]["is_crosspostable"],
-        "Score": post["data"]["score"],
-        "Category": post["data"]["category"],
-        "Domain": post["data"]["domain"],
-        "Posted on": convert_timestamp_to_datetime(post["data"]["created"]),
-        "Approved at": post["data"]["approved_at_utc"],
-        "Approved by": post["data"]["approved_by"],
-    }
-
-    # Add the post's branch to the main tree.
-    post_branch = tree.add(f":scroll: {post['data']['title']}")
-
-    # Add each piece of extracted data as a branch of the post_branch.
-    for post_key, post_value in post_data.items():
-        post_branch.add(f"{post_key}: {post_value}", style="dim")
-
-    # If -j/--json is passed, write found posts to a json file.
-    if output:
-        # This ensures that the post's selftext is also added to the written json file.
-        post_data["Text"] = post["data"]["selftext"]
-        output_file = write_post_data(filename=keyword, post_data=post_data)
-        tree.add(
-            f":page_facing_up: Post data written/appended to "
-            f"[italic][link file://{output_file}]{output_file}[/]"
-        )
-    post_branch.add(post["data"]["selftext"], style="italic")
-
-    return tree
-
-
-def get_posts(arguments: argparse):
-    """
-    Scrapes a given subreddit for posts that contain a specified keyword.
-    The search is limited by the number of posts and timeframe specified.
-
-    :param arguments: Namespace object from argparse.
-
-    Expected Object Attributes
-    --------------------------
-        - keyword: The keyword to search for in the posts.
-        - subreddit: The subreddit to scrape.
-        - listing: The type of posts to scrape. This could be 'hot', 'new', etc.
-        - timeframe: The timeframe from which to scrape posts. This could be 'day', 'week', etc.
-        - limit: The maximum number of posts to scrape.
-        - json: If specified, all found posts will be written to a json file.
-    """
-    keyword = arguments.keyword
-    subreddit = arguments.subreddit
-    listing = arguments.listing
-    timeframe = arguments.timeframe
-    limit = arguments.limit
-    json_output = arguments.json
-
-    # Create main result tree.
-    main_tree = Tree(f"[bold]{datetime.now()}[/]", guide_style="bold bright_blue")
-
-    # Start a new session
-    session = requests.session()
-    # Set the User-Agent to mimic a Safari browser on a Mac.
-    session.headers = {
-        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, "
-        "like Gecko) Version/14.1.1 Safari/605.1.15"
-    }
-
-    # Send a GET request to the specified subreddit and listing,
-    # limiting the response by the specified limit and timeframe.
-    response = session.get(
-        f"https://reddit.com/r/{subreddit}/{listing}"
-        f".json?limit={limit}&t={timeframe}"
-    ).json()
-
-    # Initialize a counter for the number of posts found that contain the keyword.
-    found_posts = 0
-
-    # Loop through each post in the response
-    for post_index, post in enumerate(response["data"]["children"], start=1):
-        # If the keyword is found in the post's selftext or title, increment the counter and process the post.
-        if (
-            keyword.lower() in post["data"]["selftext"]
-            or keyword.lower() in post["data"]["title"]
-        ):
-            # Create a branch for found post(s) and show post index and post author as the title
-            found_tree = main_tree.add(
-                f":bust_in_silhouette: #{post_index} by [bold]@{post['data']['author']}[/]"
-            )
-            found_posts += 1
-            create_post_branch(
-                post=post,
-                keyword=keyword,
-                output=json_output,
-                tree=found_tree,
-            )
-
-    # Log the number of posts in which the keyword was found
-    main_tree.add(
-        f"Keyword ('{keyword}') was found in {found_posts}/{len(response['data']['children'])} "
-        f"{listing} posts from r/{subreddit}."
-    )
-    xprint(main_tree)
-
-
-def create_parser():
-    """
-    Creates and configures an argument parser for the command line arguments.
-
-    :return: A configured argparse.ArgumentParser object ready to parse the command line arguments.
-    """
-    parser = argparse.ArgumentParser(
-        description="RPST (Reddit Post Scraping Tool)  —by Richard Mwewa | https://about.me/rly0nheart",
-        epilog="Given a subreddit name and a keyword, "
-        "RPST returns all top (by default) posts that contain the specified keyword.",
-    )
-
-    parser.add_argument(
-        "-k", "--keyword", help="The keyword to search for in the posts.", required=True
-    )
-    parser.add_argument(
-        "-s", "--subreddit", help="The subreddit to scrape.", required=True
-    )
-    parser.add_argument(
-        "-c",
-        "--limit",
-        help="The maximum number of posts to scrape (1-100). (default: %(default)s)",
-        default=10,
-        type=int,
-        choices=range(
-            1, 101
-        ),  # This enforces that the limit must be between 1 and 100 inclusive.
-    )
-    parser.add_argument(
-        "-l",
-        "--listing",
-        default="top",
-        const="top",
-        nargs="?",
-        choices=["controversial", "hot", "best", "new", "rising"],
-        help="The type of posts to scrape (default: %(default)s)",
-    )
-    parser.add_argument(
-        "-t",
-        "--timeframe",
-        default="all",
-        const="all",
-        nargs="?",
-        choices=["hour", "day", "week", "month", "year", "all"],
-        help="The timeframe from which to scrape posts (default: %(default)s)",
-    )
-    parser.add_argument(
-        "-j",
-        "--json",
-        help="Write all found posts to a json file.",
-        action="store_true",
-    )
-
-    return parser
-
-
-logging.basicConfig(
-    level="NOTSET",
-    format="%(message)s",
-    handlers=[
-        RichHandler(markup=True, log_time_format="[%H:%M:%S%p]", show_level=False)
-    ],
-)
-log = logging.getLogger("rich")
--- a/rpst/rpst.py
+++ b/rpst/rpst.py
@@ -0,0 +1,131 @@
+import argparse
+from datetime import datetime
+
+import requests
+from glyphoji import glyph
+from rich.tree import Tree
+from rich import print as xprint
+
+from .utils import convert_timestamp_to_datetime, write_post_data
+
+
+def create_post_branch(post: dict, keyword: str, tree: Tree, args: argparse) -> Tree:
+    """
+    This function extracts relevant data from a Reddit post and adds it in a tree branch structure,
+    followed by the post's selftext.
+
+    :param post: A dictionary containing the data of a Reddit post.
+    :param keyword: The keyword used to find posts; in this case it is also used as the filename.
+    :param tree: Tree where the post branch will be added.
+    :param args: A namespace object from argparse.
+    :returns: The main tree with added post branches.
+    """
+    # Define the data to extract from the post.
+    post_data = {
+        # "Author": post["data"]["author"],
+        f"{glyph.id_button} ID": post["data"]["id"],
+        f"{glyph.people_hugging} Subreddit": post["data"]["subreddit_name_prefixed"],
+        f"{glyph.face_with_peeking_eye} Visibility": post["data"]["subreddit_type"],
+        f"{glyph.framed_picture} Thumbnail": post["data"]["thumbnail"],
+        f"{glyph.white_question_mark}  Gilded": post["data"]["gilded"],
+        f"{glyph.up_arrow} Upvotes": post["data"]["ups"],
+        f"{glyph.chart_increasing} Upvote ratio": post["data"]["upvote_ratio"],
+        f"{glyph.down_arrow} Downvotes": post["data"]["downs"],
+        f"{glyph.trophy} Awards": post["data"]["total_awards_received"],
+        f"{glyph.trophy} Top award": post["data"]["top_awarded_type"],
+        f"{glyph.no_one_under_eighteen} Is NSFW?": post["data"]["over_18"],
+        f"{glyph.left_arrow_curving_right} Is crosspostable?": post["data"][
+            "is_crosspostable"
+        ],
+        f"{glyph.bar_chart} Score": post["data"]["score"],
+        f"{glyph.card_file_box} Category": post["data"]["category"],
+        f"{glyph.globe_with_meridians} Domain": post["data"]["domain"],
+        f"{glyph.calendar} Posted on": convert_timestamp_to_datetime(
+            post["data"]["created"]
+        ),
+        f"{glyph.calendar} Approved at": post["data"]["approved_at_utc"],
+        f"{glyph.bust_in_silhouette} Approved by": post["data"]["approved_by"],
+    }
+
+    # Add the post's branch to the main tree.
+    post_branch = tree.add(f"{glyph.scroll} {post['data']['title']}")
+
+    # Add each piece of extracted data as a branch of the post_branch.
+    for post_key, post_value in post_data.items():
+        post_branch.add(f"{post_key}: {post_value}", style="dim")
+
+    # This ensures that the post's selftext is also added to the written json/csv file.
+    post_data[f"{glyph.clipboard} Text"] = post["data"]["selftext"]
+    write_post_data(
+        filename=keyword, post_data=post_data, tree_branch=post_branch, args=args
+    )
+    post_branch.add(post["data"]["selftext"], style="italic")
+
+    return tree
+
+
+def get_posts(args: argparse):
+    """
+    Scrapes a given subreddit for posts that contain a specified keyword.
+    The search is limited by the number of posts and timeframe specified.
+
+    :param args: Namespace object from argparse.
+
+    Expected Object Attributes
+    --------------------------
+        - keyword: The keyword to search for in the posts.
+        - subreddit: The subreddit to scrape.
+        - listing: The type of posts to scrape. This could be 'hot', 'new', etc.
+        - timeframe: The timeframe from which to scrape posts. This could be 'day', 'week', etc.
+        - limit: The maximum number of posts to scrape.
+        - json: If specified, all found posts will be written to a json file.
+    """
+    keyword = args.keyword
+    subreddit = args.subreddit
+    listing = args.listing
+    timeframe = args.timeframe
+    limit = args.limit
+
+    # Create main result tree.
+    main_tree = Tree(
+        f"[bold]{glyph.calendar} {datetime.now()}[/]", guide_style="bold bright_blue"
+    )
+
+    # Start a new session
+    session = requests.session()
+    # Set the User-Agent to mimic a Safari browser on a Mac.
+    session.headers = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, "
+        "like Gecko) Version/14.1.1 Safari/605.1.15"
+    }
+
+    # Send a GET request to the specified subreddit and listing,
+    # limiting the response by the specified limit and timeframe.
+    response = session.get(
+        f"https://reddit.com/r/{subreddit}/{listing}"
+        f".json?limit={limit}&t={timeframe}"
+    ).json()
+
+    # Initialize a counter for the number of posts found that contain the keyword.
+    found_posts = 0
+
+    # Loop through each post in the response
+    for post_index, post in enumerate(response["data"]["children"], start=1):
+        # If the keyword is found in the post's selftext or title, increment the counter and process the post.
+        if (
+            keyword.lower() in post["data"]["selftext"]
+            or keyword.lower() in post["data"]["title"]
+        ):
+            # Create a branch for found post(s) and show post index and post author as the title
+            found_tree = main_tree.add(
+                f"{glyph.bust_in_silhouette} #{post_index} by [bold]@{post['data']['author']}[/]"
+            )
+            found_posts += 1
+            create_post_branch(post=post, keyword=keyword, tree=found_tree, args=args)
+
+    # Add a summary branch showing the number of posts in which the keyword was found
+    main_tree.add(
+        f"{glyph.check_mark_button}  Keyword ('{keyword}') was found in "
+        f"{found_posts}/{len(response['data']['children'])} {listing} posts from r/{subreddit}."
+    )
+    xprint(main_tree)