From 69a6968459432ee887c824976a1ef48cb2710c52 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 10 Feb 2023 19:16:01 +0200
Subject: [PATCH] Add files via upload

---
Review notes (git am ignores the text between "---" and the first diff):
 * main.py: explicit imports replace the wildcard import, and the elapsed
   time is logged as a number of seconds (a timedelta used to be printed
   with a "seconds" suffix).
 * reddit_post_scraping_tool.py: keyword matching lower-cases the post text
   as well as the keyword; "top" and "all" are accepted as explicit
   --listing / --timeframe values; the request has a timeout and checks the
   HTTP status; the "keyword" help-text typo is fixed.
 * The "index" blob ids of those two files still name the uploaded versions.

 Dockerfile                                    | 11 +++
 reddit_post_scraping_tool/main.py             | 22 +++++
 .../reddit_post_scraping_tool.py              | 88 +++++++++++++++++++
 setup.py                                      | 31 +++++++
 4 files changed, 152 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 reddit_post_scraping_tool/main.py
 create mode 100644 reddit_post_scraping_tool/reddit_post_scraping_tool.py
 create mode 100644 setup.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..69e9374
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,11 @@
+# syntax=docker/dockerfile:1
+
+FROM python:latest
+
+WORKDIR /app
+
+COPY . .
+
+RUN pip install --upgrade pip && pip install build && python -m build && pip install dist/*.whl
+
+ENTRYPOINT ["reddit_post_scraping_tool"]
\ No newline at end of file
diff --git a/reddit_post_scraping_tool/main.py b/reddit_post_scraping_tool/main.py
new file mode 100644
index 0000000..984f17f
--- /dev/null
+++ b/reddit_post_scraping_tool/main.py
@@ -0,0 +1,22 @@
+"""Console entry point for reddit-post-scraping-tool."""
+from datetime import datetime
+
+from reddit_post_scraping_tool.reddit_post_scraping_tool import (
+    log,
+    reddit_post_scraper,
+    start_time,
+)
+
+
+def main():
+    """Run the scraper, logging interruptions, failures and the elapsed time."""
+    try:
+        reddit_post_scraper()
+    except KeyboardInterrupt:
+        log.warning("User interruption detected.")
+    except Exception as e:
+        # Top-level boundary: report the failure instead of dumping a traceback.
+        log.error(e)
+    finally:
+        elapsed = (datetime.now() - start_time).total_seconds()
+        log.info(f'Finished in {elapsed:.2f} seconds.')
diff --git a/reddit_post_scraping_tool/reddit_post_scraping_tool.py b/reddit_post_scraping_tool/reddit_post_scraping_tool.py
new file mode 100644
index 0000000..939d0cd
--- /dev/null
+++ b/reddit_post_scraping_tool/reddit_post_scraping_tool.py
@@ -0,0 +1,88 @@
+import logging
+import argparse
+import requests
+from rich.tree import Tree
+from datetime import datetime
+from rich import print as xprint
+from rich.logging import RichHandler
+
+start_time = datetime.now()
+logging.basicConfig(level="NOTSET", format="%(message)s", handlers=[RichHandler(markup=True, log_time_format='[%H:%M:%S%p]')])
+log = logging.getLogger("rich")
+
+
+# Getting posts
+def get_posts(post):
+    """Print one post as a rich tree of its metadata, followed by its body text."""
+    post_data = {'Author': post['data']['author'],
+                 'ID': post['data']['id'],
+                 'Subreddit': post["data"]["subreddit_name_prefixed"],
+                 'Visibility': post['data']['subreddit_type'],
+                 'Thumbnail': post["data"]["thumbnail"],
+                 'NSFW': post['data']['over_18'],
+                 'Gilded': post['data']['gilded'],
+                 'Upvotes': post["data"]["ups"],
+                 'Upvote ratio': post["data"]["upvote_ratio"],
+                 'Downvotes': post["data"]["downs"],
+                 'Awards': post["data"]["total_awards_received"],
+                 'Top award': post['data']['top_awarded_type'],
+                 'Is crosspostable?': post['data']['is_crosspostable'],
+                 'Score': post["data"]["score"],
+                 'Category': post['data']['category'],
+                 'Domain': post["data"]["domain"],
+                 'Created': post['data']['created'],
+                 'Approved at': post['data']['approved_at_utc'],
+                 'Approved by': post['data']['approved_by'], }
+
+    post_tree = Tree("\n" + post['data']['title'])
+    for post_key, post_value in post_data.items():
+        post_tree.add(f"{post_key}: {post_value}")
+    xprint(post_tree)
+    print(post['data']['selftext'] + "\n")
+
+
+def reddit_post_scraper():
+    """Fetch the requested listing and print every post that contains the keyword.
+
+    The keyword is matched case-insensitively against each post's title and
+    body text; a summary line is logged once the whole listing has been scanned.
+    """
+    session = requests.session()
+    # update() overrides only the User-Agent and keeps requests' other default headers.
+    session.headers.update({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'})
+    response = session.get(f'https://reddit.com/r/{args.subreddit}/{args.listing}.json?limit={args.limit}&t={args.timeframe}', timeout=30)
+    # Surface HTTP failures (404, 429, ...) directly instead of as a JSON/KeyError later.
+    response.raise_for_status()
+    posts = response.json()['data']['children']
+    keyword = args.keyword.lower()
+    found_posts = 0
+    for post in posts:
+        # Lower-case both sides; lowering only the keyword made the match miss
+        # any post that spells the word with a capital letter.
+        if keyword in post['data']['selftext'].lower() or keyword in post['data']['title'].lower():
+            found_posts += 1
+            get_posts(post)
+
+    log.info(f"Keyword ('{args.keyword}') was found in {found_posts}/{len(posts)} {args.listing} posts from r/{args.subreddit}.")
+
+
+def create_parser():
+    """Build the command-line argument parser."""
+    parser = argparse.ArgumentParser(
+        description='reddit-post-scraping-tool — by Richard Mwewa | https://about.me/rly0nheart',
+        epilog='Given a subreddit name and a keyword, this program returns all top (by default) posts that contain the specified word. ')
+    parser.add_argument('-k', '--keyword', help='keyword', required=True)
+    parser.add_argument('-s', '--subreddit', help='subreddit', required=True)
+    parser.add_argument('-c', '--limit', help='results limit (1-100) (default: %(default)s)', default=10, type=int)
+    # 'top' and 'all' are the defaults, so they must also be valid when passed explicitly.
+    parser.add_argument('-l', '--listing', default='top', const='top', nargs='?',
+                        choices=['top', 'controversial', 'hot', 'best', 'new', 'rising'],
+                        help='listings: top, controversial, hot, best, new, rising (default: %(default)s)')
+    parser.add_argument('-t', '--timeframe', default='all', const='all', nargs='?',
+                        choices=['all', 'hour', 'day', 'week', 'month', 'year'],
+                        help='timeframe: all, hour, day, week, month, year (default: %(default)s)')
+    return parser
+
+
+_parser = create_parser()
+args = _parser.parse_args()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..c4289a7
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,31 @@
+import setuptools
+
+with open("README.md", "r", encoding="utf-8") as file:
+    long_description = file.read()
+
+setuptools.setup(
+    name="reddit-post-scraping-tool",
+    version="1.0.0",
+    author="Richard Mwewa",
+    author_email="rly0nheart@duck.com",
+    packages=["reddit_post_scraping_tool"],
+    description="Given a subreddit name and a keyword, this program returns all top (by default) posts that contain the specified word.",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/rly0nheart/reddit-post-scraping-tool",
+    license="MIT License",
+    install_requires=["rich", "requests"],
+    classifiers=[
+        'Development Status :: 5 - Production/Stable',
+        'Intended Audience :: Information Technology',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+        'Natural Language :: English',
+        'Programming Language :: Python :: 3'
+    ],
+    entry_points={
+        "console_scripts": [
+            "reddit_post_scraping_tool=reddit_post_scraping_tool.main:main",
+        ]
+    },
+)
\ No newline at end of file