Files
reddit-post-scraping-tool/reddit_post_scraping_tool/reddit_post_scraping_tool.py
2023-03-01 23:07:52 +02:00

92 lines
4.5 KiB
Python

import logging
import argparse
import requests
from rich.tree import Tree
from datetime import datetime
from rich import print as xprint
from rich.markdown import Markdown
from rich.logging import RichHandler
start_time = datetime.now()
logging.basicConfig(level="NOTSET", format="%(message)s", handlers=[RichHandler(markup=True, log_time_format='[%H:%M:%S%p]')])
log = logging.getLogger("rich")
# Check if the remote tag_name from the latest release matches the one in the program
# if it does, it means the program is up-to-date.
# If it doesn't match, notify the user about a new release
def check_updates(version_tag):
response = requests.get("https://api.github.com/repos/bellingcat/reddit-post-scraping-tool/releases/latest").json()
if response['tag_name'] == version_tag:
pass
else:
raw_release_notes = response['body']
markdown_release_notes = Markdown(raw_release_notes)
log.info(f"A new release of reddit-post-scraping-tool is available ({response['tag_name']}). Run 'pip install --upgrade reddit-post-scraping-tool' to get the updates.")
xprint(markdown_release_notes)
# Getting posts
def get_posts(post):
post_data = {'Author': post['data']['author'],
'ID': post['data']['id'],
'Subreddit': post["data"]["subreddit_name_prefixed"],
'Visibility': post['data']['subreddit_type'],
# 'Author': post["data"]["author_fullname"],
'Thumbnail': post["data"]["thumbnail"],
# 'Flair': post["data"]["link_flair_text"],
'NSFW': post['data']['over_18'],
'Gilded': post['data']['gilded'],
'Upvotes': post["data"]["ups"],
'Upvote ratio': post["data"]["upvote_ratio"],
'Downvotes': post["data"]["downs"],
'Awards': post["data"]["total_awards_received"],
'Top award': post['data']['top_awarded_type'],
'Is crosspostable?': post['data']['is_crosspostable'],
'Score': post["data"]["score"],
'Category': post['data']['category'],
'Domain': post["data"]["domain"],
'Created': post['data']['created'],
'Approved at': post['data']['approved_at_utc'],
'Approved by': post['data']['approved_by'], }
post_tree = Tree("\n" + post['data']['title'])
for post_key, post_value in post_data.items():
post_tree.add(f"{post_key}: {post_value}")
xprint(post_tree)
print(post['data']['selftext'] + "\n")
def reddit_post_scraper():
session = requests.session()
session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'}
response = session.get(f'https://reddit.com/r/{args.subreddit}/{args.listing}.json?limit={args.limit}&t={args.timeframe}').json()
found_posts = 0
for post in response['data']['children']:
if args.keyword.lower() in post['data']['selftext'] or args.keyword.lower() in post['data']['title']:
found_posts += 1
get_posts(post)
log.info(f"Keyword ('{args.keyword}') was found in {found_posts}/{len(response['data']['children'])} {args.listing} posts from r/{args.subreddit}.")
def create_parser():
parser = argparse.ArgumentParser(
description=f'reddit-post-scraping-tool — by Richard Mwewa | https://about.me/rly0nheart',
epilog=f'Given a subreddit name and a keyword, this program returns all top (by default) posts that contain the specified word. ')
parser.add_argument('-k', '--keyword', help='kewyword', required=True)
parser.add_argument('-s', '--subreddit', help='subreddit', required=True)
parser.add_argument('-c', '--limit', help='results limit (1-100) (default: %(default)s)', default=10, type=int)
parser.add_argument('-l', '--listing', default='top', const='top', nargs='?',
choices=['controversial', 'hot', 'best', 'new', 'rising'],
help='listings: controversial, hot, best, new, rising (default: %(default)s)')
parser.add_argument('-t', '--timeframe', default='all', const='all', nargs='?',
choices=['hour', 'day', 'week', 'month', 'year'],
help='timeframe: hour, day, week, month, year (default: %(default)s)')
return parser
_parser = create_parser()
args = _parser.parse_args()