mirror of
https://github.com/bellingcat/reddit-post-scraping-tool.git
synced 2026-06-11 21:18:29 +03:00
Add files via upload
This commit is contained in:
11
Dockerfile
Normal file
11
Dockerfile
Normal file
@@ -0,0 +1,11 @@
|
||||
# syntax=docker/dockerfile:1
|
||||
|
||||
FROM python:latest
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN pip install --upgrade pip && pip install build && python -m build && pip install dist/*.whl
|
||||
|
||||
ENTRYPOINT ["reddit_post_scraping_tool"]
|
||||
12
reddit_post_scraping_tool/main.py
Normal file
12
reddit_post_scraping_tool/main.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from reddit_post_scraping_tool.reddit_post_scraping_tool import *
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
reddit_post_scraper()
|
||||
except KeyboardInterrupt:
|
||||
log.warning(f"User interruption detected.")
|
||||
except Exception as e:
|
||||
log.error(e)
|
||||
finally:
|
||||
log.info(f'Finished in {datetime.now() - start_time} seconds.')
|
||||
75
reddit_post_scraping_tool/reddit_post_scraping_tool.py
Normal file
75
reddit_post_scraping_tool/reddit_post_scraping_tool.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import logging
|
||||
import argparse
|
||||
import requests
|
||||
from rich.tree import Tree
|
||||
from datetime import datetime
|
||||
from rich import print as xprint
|
||||
from rich.logging import RichHandler
|
||||
|
||||
start_time = datetime.now()
|
||||
logging.basicConfig(level="NOTSET", format="%(message)s", handlers=[RichHandler(markup=True, log_time_format='[%H:%M:%S%p]')])
|
||||
log = logging.getLogger("rich")
|
||||
|
||||
|
||||
# Getting posts
|
||||
def get_posts(post):
|
||||
post_data = {'Author': post['data']['author'],
|
||||
'ID': post['data']['id'],
|
||||
'Subreddit': post["data"]["subreddit_name_prefixed"],
|
||||
'Visibility': post['data']['subreddit_type'],
|
||||
# 'Author': post["data"]["author_fullname"],
|
||||
'Thumbnail': post["data"]["thumbnail"],
|
||||
# 'Flair': post["data"]["link_flair_text"],
|
||||
'NSFW': post['data']['over_18'],
|
||||
'Gilded': post['data']['gilded'],
|
||||
'Upvotes': post["data"]["ups"],
|
||||
'Upvote ratio': post["data"]["upvote_ratio"],
|
||||
'Downvotes': post["data"]["downs"],
|
||||
'Awards': post["data"]["total_awards_received"],
|
||||
'Top award': post['data']['top_awarded_type'],
|
||||
'Is crosspostable?': post['data']['is_crosspostable'],
|
||||
'Score': post["data"]["score"],
|
||||
'Category': post['data']['category'],
|
||||
'Domain': post["data"]["domain"],
|
||||
'Created': post['data']['created'],
|
||||
'Approved at': post['data']['approved_at_utc'],
|
||||
'Approved by': post['data']['approved_by'], }
|
||||
|
||||
post_tree = Tree("\n" + post['data']['title'])
|
||||
for post_key, post_value in post_data.items():
|
||||
post_tree.add(f"{post_key}: {post_value}")
|
||||
xprint(post_tree)
|
||||
print(post['data']['selftext'] + "\n")
|
||||
|
||||
|
||||
def reddit_post_scraper():
|
||||
session = requests.session()
|
||||
session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'}
|
||||
response = session.get(f'https://reddit.com/r/{args.subreddit}/{args.listing}.json?limit={args.limit}&t={args.timeframe}').json()
|
||||
found_posts = 0
|
||||
for post in response['data']['children']:
|
||||
if args.keyword.lower() in post['data']['selftext'] or args.keyword.lower() in post['data']['title']:
|
||||
found_posts += 1
|
||||
get_posts(post)
|
||||
|
||||
log.info(f"Keyword ('{args.keyword}') was found in {found_posts}/{len(response['data']['children'])} {args.listing} posts from r/{args.subreddit}.")
|
||||
|
||||
|
||||
def create_parser():
|
||||
parser = argparse.ArgumentParser(
|
||||
description=f'reddit-post-scraping-tool — by Richard Mwewa | https://about.me/rly0nheart',
|
||||
epilog=f'Given a subreddit name and a keyword, this program returns all top (by default) posts that contain the specified word. ')
|
||||
parser.add_argument('-k', '--keyword', help='kewyword', required=True)
|
||||
parser.add_argument('-s', '--subreddit', help='subreddit', required=True)
|
||||
parser.add_argument('-c', '--limit', help='results limit (1-100) (default: %(default)s)', default=10, type=int)
|
||||
parser.add_argument('-l', '--listing', default='top', const='top', nargs='?',
|
||||
choices=['controversial', 'hot', 'best', 'new', 'rising'],
|
||||
help='listings: controversial, hot, best, new, rising (default: %(default)s)')
|
||||
parser.add_argument('-t', '--timeframe', default='all', const='all', nargs='?',
|
||||
choices=['hour', 'day', 'week', 'month', 'year'],
|
||||
help='timeframe: hour, day, week, month, year (default: %(default)s)')
|
||||
return parser
|
||||
|
||||
|
||||
_parser = create_parser()
|
||||
args = _parser.parse_args()
|
||||
31
setup.py
Normal file
31
setup.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import setuptools
|
||||
|
||||
with open("README.md", "r", encoding="utf-8") as file:
|
||||
long_description = file.read()
|
||||
|
||||
setuptools.setup(
|
||||
name="reddit-post-scraping-tool",
|
||||
version="1.0.0",
|
||||
author="Richard Mwewa",
|
||||
author_email="rly0nheart@duck.com",
|
||||
packages=["reddit_post_scraping_tool"],
|
||||
description="Given a subreddit name and a keyword, this program returns all top (by default) posts that contain the specified word.",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://github.com/rly0nheart/reddit-post-scraping-tool",
|
||||
license="MIT License",
|
||||
install_requires=["rich", "requests"],
|
||||
classifiers=[
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Intended Audience :: Information Technology',
|
||||
'License :: OSI Approved :: MIT License',
|
||||
'Operating System :: OS Independent',
|
||||
'Natural Language :: English',
|
||||
'Programming Language :: Python :: 3'
|
||||
],
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"reddit_post_scraping_tool=reddit_post_scraping_tool.main:main",
|
||||
]
|
||||
},
|
||||
)
|
||||
Reference in New Issue
Block a user