mirror of
https://github.com/bellingcat/reddit-post-scraping-tool.git
synced 2026-06-10 20:48:29 +03:00
Compare commits
31 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
750967c322 | ||
|
|
cfef86cbe3 | ||
|
|
2a2696403d | ||
|
|
b0a8d75d8c | ||
|
|
b31c38f5cc | ||
|
|
b5b7df868e | ||
|
|
566f558720 | ||
|
|
c3e5ce6441 | ||
|
|
7c164938c9 | ||
|
|
b08c4a147b | ||
|
|
8f259b7a40 | ||
|
|
f117c99cc7 | ||
|
|
3a9a87e67c | ||
|
|
cce254e976 | ||
|
|
418b2acc4c | ||
|
|
d26699cc1f | ||
|
|
9efb1cea4a | ||
|
|
ba6eeb38a6 | ||
|
|
2053c0f0bc | ||
|
|
8bef73001c | ||
|
|
c9d9628326 | ||
|
|
2f1619b4c5 | ||
|
|
33db66dbc3 | ||
|
|
bbbdab906d | ||
|
|
74264224a5 | ||
|
|
ce75d40f76 | ||
|
|
406e34c4bb | ||
|
|
38140ea2be | ||
|
|
4c3d3a688f | ||
|
|
a03b649904 | ||
|
|
aa3b506a96 |
29
README.md
29
README.md
@@ -1,20 +1,18 @@
|
||||
# RPST (Reddit Post Scraping Tool)
|
||||
Given a subreddit name and a keyword, RPST will return all posts from a specified listing (default is 'top') that contain the provided keyword.
|
||||
|
||||
[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
|
||||

|
||||

|
||||
***
|
||||
[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
|
||||
|
||||
# ✅ Features
|
||||
## GUI
|
||||
- [x] Dark mode (Right-click)
|
||||
- [x] Saves results to a JSON (Right-click)
|
||||
- [x] Dark mode (*Right-click*)
|
||||
- [x] Saves results to a JSON file (*Right-click*)
|
||||
- [x] Logs errors to a file
|
||||
|
||||
## CLI
|
||||
- [x] Saves results to a JSON (-j/--json)
|
||||
- [x] Automatically checks for new updates. Notifies user if update were found.
|
||||
- [x] Saves results to JSON (*specify* `--json`)
|
||||
- [x] Saves results to CSV (*specify* `--csv`)
|
||||
- [x] Automatically checks for new updates, and notifies user if updates were found.
|
||||
|
||||
# 📃 TODO
|
||||
## GUI
|
||||
@@ -22,8 +20,21 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
|
||||
- [x] Add manual dark mode option, that will be persistent in all sessions
|
||||
- [ ] Make it save results to a CSV file
|
||||
|
||||
# Images & Screenshots
|
||||
## GUI
|
||||
* 
|
||||
* 
|
||||
|
||||
## CLI
|
||||
* 
|
||||
* 
|
||||
* 
|
||||
|
||||
|
||||
|
||||
|
||||
# 📖 Wiki
|
||||
[Refer to the Wiki](https://github.com/rly0nheart/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
|
||||
[Refer to the Wiki](https://github.com/bellingcat/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
|
||||
|
||||
# 😁 Donations
|
||||
If you like `RPST` and would like to show support, you can Buy A Coffee for the developer using the button below
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
# RPST (Reddit Post Scraping Tool)
|
||||
Given a subreddit name and a keyword, RPST will return all posts from a specified listing (default is 'top') that contain the provided keyword.
|
||||
|
||||
[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
|
||||

|
||||

|
||||
***
|
||||
[](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/python-publish.yml) [](https://github.com/rly0nheart/reddit-post-scraping-tool/actions/workflows/codeql.yml)  
|
||||
|
||||
# ✅ Features
|
||||
## GUI
|
||||
- [x] Dark mode (Right-click)
|
||||
- [x] Saves results to a JSON (Right-click)
|
||||
- [x] Dark mode (*Right-click*)
|
||||
- [x] Saves results to a JSON file (*Right-click*)
|
||||
- [x] Logs errors to a file
|
||||
|
||||
## CLI
|
||||
- [x] Saves results to a JSON (-j/--json)
|
||||
- [x] Automatically checks for new updates. Notifies user if update were found.
|
||||
- [x] Saves results to JSON (*specify* `--json`)
|
||||
- [x] Saves results to CSV (*specify* `--csv`)
|
||||
- [x] Automatically checks for new updates, and notifies user if updates were found.
|
||||
|
||||
# 📃 TODO
|
||||
## GUI
|
||||
@@ -22,8 +20,21 @@ Given a subreddit name and a keyword, RPST will return all posts from a specifie
|
||||
- [x] Add manual dark mode option, that will be persistent in all sessions
|
||||
- [ ] Make it save results to a CSV file
|
||||
|
||||
# Images & Screenshots
|
||||
## GUI
|
||||
* 
|
||||
* 
|
||||
|
||||
## CLI
|
||||
* 
|
||||
* 
|
||||
* 
|
||||
|
||||
|
||||
|
||||
|
||||
# 📖 Wiki
|
||||
[Refer to the Wiki](https://github.com/rly0nheart/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
|
||||
[Refer to the Wiki](https://github.com/bellingcat/reddit-post-scraping-tool/wiki) for installation instructions, in addition to all other documentation.
|
||||
|
||||
# 😁 Donations
|
||||
If you like `RPST` and would like to show support, you can Buy A Coffee for the developer using the button below
|
||||
|
||||
@@ -13,11 +13,11 @@
|
||||
<PackageProjectUrl>https://github.com/bellingcat/reddit-post-scraping-tool</PackageProjectUrl>
|
||||
<PackageReadmeFile>README.md</PackageReadmeFile>
|
||||
<RepositoryUrl>https://github.com/bellingcat/reddit-post-scraping-tool</RepositoryUrl>
|
||||
<AssemblyVersion>1.6.0.0</AssemblyVersion>
|
||||
<FileVersion>1.6.0.0</FileVersion>
|
||||
<AssemblyVersion>1.7.0.0</AssemblyVersion>
|
||||
<FileVersion>1.7.0.0</FileVersion>
|
||||
<PackageLicenseFile>LICENSE</PackageLicenseFile>
|
||||
<PackageRequireLicenseAcceptance>True</PackageRequireLicenseAcceptance>
|
||||
<Version>1.6.0</Version>
|
||||
<Version>1.7.0</Version>
|
||||
<PackageTags>reddit;scraper;reddit-scraper;osint</PackageTags>
|
||||
<PackageReleaseNotes></PackageReleaseNotes>
|
||||
<AnalysisLevel>6.0-recommended</AnalysisLevel>
|
||||
@@ -39,7 +39,7 @@
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Newtonsoft.Json" Version="13.0.2" />
|
||||
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
@@ -78,4 +78,4 @@
|
||||
</None>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
@@ -7,18 +7,18 @@ packages = ["rpst"]
|
||||
|
||||
[project]
|
||||
name = "reddit-post-scraping-tool"
|
||||
version = "1.6.0.0"
|
||||
version = "1.7.0.0"
|
||||
description = "Given a subreddit name and a keyword, RPST returns all top (by default) posts that contain the specified keyword."
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.8"
|
||||
license = {file = "LICENSE"}
|
||||
keywords = ["osint", "reddit-crawler", "reddit-scraping", "reddit"]
|
||||
keywords = ["reddit-crawler", "reddit-scraping", "reddit", "reddit-api"]
|
||||
authors = [{name = "Richard Mwewa", email = "rly0nheart@duck.com"}]
|
||||
classifiers = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Visual Basic",
|
||||
"Intended Audience :: Information Technology",
|
||||
"Intended Audience :: End Users/Desktop",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
"Natural Language :: English"
|
||||
@@ -26,6 +26,7 @@ classifiers = [
|
||||
|
||||
dependencies = [
|
||||
"rich",
|
||||
"glyphoji",
|
||||
"requests",
|
||||
]
|
||||
|
||||
@@ -35,4 +36,4 @@ documentation = "https://github.com/bellingcat/reddit-post-scraping-tool/wiki"
|
||||
repository = "https://github.com/bellingcat/reddit-post-scraping-tool.git"
|
||||
|
||||
[project.scripts]
|
||||
rpst = "rpst.__main:run"
|
||||
rpst = "rpst.main:run"
|
||||
|
||||
1
rpst/__init__.py
Normal file
1
rpst/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
201
rpst/__rpst_.py
201
rpst/__rpst_.py
@@ -1,201 +0,0 @@
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
import requests
|
||||
from rich.tree import Tree
|
||||
from rich import print as xprint
|
||||
from rich.markdown import Markdown
|
||||
from rich.logging import RichHandler
|
||||
|
||||
|
||||
def write_post_data(post_data: dict, filename: str):
    """
    Append a single post to ``<filename>.json`` as one line of JSON.

    :param post_data: A dictionary containing post data.
    :param filename: Base name of the output file; ".json" is appended to it.
    """
    output_path = filename + ".json"

    # Append mode keeps posts written by earlier calls; each post occupies its own line.
    with open(output_path, 'a') as handle:
        handle.write(json.dumps(post_data))
        handle.write('\n')

    log.info(f"Post data written to '{output_path}'")
|
||||
|
||||
|
||||
def check_updates(version_tag: str):
    """
    Check GitHub for a newer release of RPST and announce it if there is one.

    The check is best-effort: a network failure, a non-JSON reply or an error payload
    from the GitHub API is logged as a warning and the function returns normally.

    :param version_tag: A string representing the current version of the project.
    """
    releases_url = "https://api.github.com/repos/bellingcat/reddit-post-scraping-tool/releases/latest"

    try:
        # The timeout stops a stalled connection from hanging the tool indefinitely.
        response = requests.get(releases_url, timeout=10).json()
    except (requests.RequestException, ValueError) as error:
        log.warning(f"Could not check for updates: {error}")
        return

    # Error payloads carry a 'message' instead of a 'tag_name'.
    latest_tag = response.get('tag_name') if isinstance(response, dict) else None
    if latest_tag is None:
        log.warning("Could not check for updates: unexpected response from the GitHub API.")
        return

    # Nothing to announce when the latest release is the one already running.
    if latest_tag == version_tag:
        return

    log.info(
        f"A new release of RPST is available ({latest_tag}). "
        f"Run 'pip install --upgrade reddit-post-scraping-tool' to get the updates."
    )

    # Rich renders the Markdown release notes directly in the terminal.
    # 'body' may be null for a release without notes.
    xprint(Markdown(response.get('body') or ""))
|
||||
|
||||
|
||||
def format_post_data(post: dict, keyword: str, output: bool):
    """
    Extract the relevant fields of a Reddit post, optionally save them, and print them
    as a tree followed by the post's selftext.

    :param post: A dictionary containing the data of a Reddit post.
    :param keyword: The keyword used to find posts; in this case it is also used as the
        name of the output file.
    :param output: If True, the extracted data is written to a json file.
    """
    # Local import: only this block needs it, so the file's import header is untouched.
    from rich.markup import escape

    data = post['data']

    # Define the data to extract from the post
    post_data = {
        'Author': data['author'],
        'ID': data['id'],
        'Subreddit': data["subreddit_name_prefixed"],
        'Visibility': data['subreddit_type'],
        'Thumbnail': data["thumbnail"],
        'NSFW': data['over_18'],
        'Gilded': data['gilded'],
        'Upvotes': data["ups"],
        'Upvote ratio': data["upvote_ratio"],
        'Downvotes': data["downs"],
        'Awards': data["total_awards_received"],
        'Top award': data['top_awarded_type'],
        'Is crosspostable?': data['is_crosspostable'],
        'Score': data["score"],
        'Category': data['category'],
        'Domain': data["domain"],
        'Created': data['created'],
        'Approved at': data['approved_at_utc'],
        'Approved by': data['approved_by'],
    }

    if output:
        write_post_data(filename=keyword, post_data=post_data)

    # Rich treats "[...]" in strings as console markup. Post text is untrusted and often
    # contains brackets, so it is escaped to be shown literally rather than being
    # swallowed as a tag or raising a MarkupError.
    post_tree = Tree("\n" + escape(data['title']))

    # Add each piece of extracted data as a branch of the tree
    for post_key, post_value in post_data.items():
        post_tree.add(f"{post_key}: {escape(str(post_value))}")

    # Print the tree structure
    xprint(post_tree)

    # The selftext goes through the builtin print, so it needs no escaping.
    print(data['selftext'] + "\n")
|
||||
|
||||
|
||||
def get_posts(arguments: argparse.Namespace):
    """
    Scrape a subreddit listing for posts that contain a keyword.

    The keyword is matched case-insensitively against each post's selftext and title.
    Every matching post is passed to ``format_post_data`` and the number of matches is
    logged at the end.

    :param arguments: Namespace object from argparse.

    Expected Object Attributes
    --------------------------
    - keyword: The keyword to search for in the posts.
    - subreddit: The subreddit to scrape.
    - listing: The type of posts to scrape. This could be 'hot', 'new', etc.
    - timeframe: The timeframe from which to scrape posts. This could be 'day', 'week', etc.
    - limit: The maximum number of posts to scrape.
    - json: If specified, all found posts will be written to a json file.
    """
    keyword = arguments.keyword
    subreddit = arguments.subreddit
    listing = arguments.listing
    timeframe = arguments.timeframe
    limit = arguments.limit
    json_output = arguments.json

    # Start a new session
    session = requests.session()
    # Set the User-Agent to mimic a Safari browser on a Mac
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, '
                                     'like Gecko) Version/14.1.1 Safari/605.1.15'}

    # Send a GET request to the specified subreddit and listing,
    # limiting the response by the specified limit and timeframe
    response = session.get(f'https://reddit.com/r/{subreddit}/{listing}'
                           f'.json?limit={limit}&t={timeframe}').json()

    posts = response['data']['children']

    # The keyword is lower-cased once. The post text must be lower-cased as well,
    # otherwise "Python" in a title would never match the keyword "python".
    needle = keyword.lower()

    # Counter for the number of posts that contain the keyword
    found_posts = 0

    for post in posts:
        data = post['data']
        if needle in data['selftext'].lower() or needle in data['title'].lower():
            found_posts += 1
            format_post_data(post=post, keyword=keyword, output=json_output)

    # Log the number of posts in which the keyword was found
    log.info(f"Keyword ('{keyword}') was found in {found_posts}/{len(posts)} "
             f"{listing} posts from r/{subreddit}.")
|
||||
|
||||
|
||||
def create_parser():
    """
    Create and configure the parser for RPST's command line arguments.

    :return: A configured argparse.ArgumentParser object ready to parse the command line arguments.
    """
    parser = argparse.ArgumentParser(
        description='RPST: Reddit Post Scraping Tool —by Richard Mwewa | https://about.me/rly0nheart',
        epilog='Given a subreddit name and a keyword, '
               'RPST returns all top (by default) posts that contain the specified keyword.'
    )

    parser.add_argument('-k', '--keyword', help='The keyword to search for in the posts.', required=True)
    parser.add_argument('-s', '--subreddit', help='The subreddit to scrape.', required=True)
    parser.add_argument(
        '-c', '--limit',
        help='The maximum number of posts to scrape (1-100). (default: %(default)s)',
        default=10,
        type=int,
        choices=range(1, 101),  # This enforces that the limit must be between 1 and 100 inclusive.
        metavar='N'  # Without a metavar the usage line would list all 100 choices.
    )
    parser.add_argument(
        '-l', '--listing',
        default='top',
        const='top',
        nargs='?',
        # 'top' is the default and must also be a valid choice,
        # otherwise an explicit "-l top" is rejected.
        choices=['top', 'controversial', 'hot', 'best', 'new', 'rising'],
        help='The type of posts to scrape (default: %(default)s)'
    )
    parser.add_argument(
        '-t', '--timeframe',
        default='all',
        const='all',
        nargs='?',
        choices=['hour', 'day', 'week', 'month', 'year', 'all'],
        help='The timeframe from which to scrape posts (default: %(default)s)'
    )
    parser.add_argument(
        '-j', '--json',
        help='Write all found posts to a json file.',
        action='store_true'
    )

    return parser
|
||||
|
||||
|
||||
# Route every record (level NOTSET) through Rich, so messages are rendered with console
# markup and prefixed with an "[HH:MM:SSAM/PM]" timestamp.
logging.basicConfig(
    level="NOTSET",
    format="%(message)s",
    handlers=[
        RichHandler(markup=True, log_time_format='[%H:%M:%S%p]'),
    ],
)

# Logger shared by the functions of this module.
log = logging.getLogger("rich")
|
||||
@@ -1,5 +1,7 @@
|
||||
from datetime import datetime
|
||||
from rpst.__rpst import log, get_posts, check_updates, create_parser
|
||||
|
||||
from .rpst import get_posts
|
||||
from .utils import create_parser, set_loglevel, check_updates
|
||||
|
||||
|
||||
def run():
|
||||
@@ -10,20 +12,22 @@ def run():
|
||||
|
||||
# Create a parser and parse the command line arguments
|
||||
parser = create_parser()
|
||||
arguments = parser.parse_args()
|
||||
args = parser.parse_args()
|
||||
|
||||
log = set_loglevel(args=args)
|
||||
|
||||
# Record the start time
|
||||
start_time = datetime.now()
|
||||
|
||||
try:
|
||||
# Check for updates
|
||||
check_updates(version_tag="1.6.0.0")
|
||||
check_updates(version_tag="1.7.0.0")
|
||||
|
||||
# Get posts with the provided/parsed arguments
|
||||
get_posts(arguments=arguments)
|
||||
get_posts(args=args)
|
||||
except KeyboardInterrupt:
|
||||
log.warning("User interruption detected.")
|
||||
except Exception as e:
|
||||
log.error(f"An error occurred: {e}")
|
||||
finally:
|
||||
log.info(f'Finished in {datetime.now() - start_time} seconds.')
|
||||
log.info(f"Finished in {datetime.now() - start_time} seconds.")
|
||||
131
rpst/rpst.py
Normal file
131
rpst/rpst.py
Normal file
@@ -0,0 +1,131 @@
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from glyphoji import glyph
|
||||
from rich.tree import Tree
|
||||
from rich import print as xprint
|
||||
|
||||
from .utils import convert_timestamp_to_datetime, write_post_data
|
||||
|
||||
|
||||
def create_post_branch(post: dict, keyword: str, tree: Tree, args: argparse.Namespace) -> Tree:
    """
    Extract the relevant fields of a Reddit post and add them to ``tree`` as a branch,
    followed by the post's selftext.

    The extracted data, together with the selftext, is also handed to
    ``write_post_data``, which decides from ``args`` whether anything is written.

    :param post: A dictionary containing the data of a Reddit post.
    :param keyword: The keyword used to find posts; in this case it is also used as the
        name of the output file.
    :param tree: Tree where the post branch will be added.
    :param args: A namespace object from argparse.
    :returns: The tree that was passed in, with the post branch added.
    """
    # Local import: only this block needs it, so the file's import header is untouched.
    from rich.markup import escape

    data = post["data"]

    # Define the data to extract from the post.
    post_data = {
        # "Author": data["author"],
        f"{glyph.id_button} ID": data["id"],
        f"{glyph.people_hugging} Subreddit": data["subreddit_name_prefixed"],
        f"{glyph.face_with_peeking_eye} Visibility": data["subreddit_type"],
        f"{glyph.framed_picture} Thumbnail": data["thumbnail"],
        f"{glyph.white_question_mark} Gilded": data["gilded"],
        f"{glyph.up_arrow} Upvotes": data["ups"],
        f"{glyph.chart_increasing} Upvote ratio": data["upvote_ratio"],
        f"{glyph.down_arrow} Downvotes": data["downs"],
        f"{glyph.trophy} Awards": data["total_awards_received"],
        f"{glyph.trophy} Top award": data["top_awarded_type"],
        f"{glyph.no_one_under_eighteen} Is NSFW?": data["over_18"],
        f"{glyph.left_arrow_curving_right} Is crosspostable?": data["is_crosspostable"],
        f"{glyph.bar_chart} Score": data["score"],
        f"{glyph.card_file_box} Category": data["category"],
        f"{glyph.globe_with_meridians} Domain": data["domain"],
        f"{glyph.calendar} Posted on": convert_timestamp_to_datetime(data["created"]),
        f"{glyph.calendar} Approved at": data["approved_at_utc"],
        f"{glyph.bust_in_silhouette} Approved by": data["approved_by"],
    }

    # Rich treats "[...]" in strings as console markup. Post text is untrusted and often
    # contains brackets (e.g. Markdown links), so it is escaped to be shown literally
    # rather than being swallowed as a tag or raising a MarkupError.
    post_branch = tree.add(f"{glyph.scroll} {escape(data['title'])}")

    # Add each piece of extracted data as a branch of the post_branch.
    for post_key, post_value in post_data.items():
        post_branch.add(f"{post_key}: {escape(str(post_value))}", style="dim")

    # Added after the loop so the selftext ends up in the written json/csv file without
    # being shown twice in the tree. The written data stays unescaped.
    post_data[f"{glyph.clipboard} Text"] = data["selftext"]
    write_post_data(
        filename=keyword, post_data=post_data, tree_branch=post_branch, args=args
    )
    post_branch.add(escape(data["selftext"]), style="italic")

    return tree
|
||||
|
||||
|
||||
def get_posts(args: argparse.Namespace):
    """
    Scrape a subreddit listing for posts that contain a keyword and print them as a tree.

    The keyword is matched case-insensitively against each post's selftext and title.
    The search is limited by the number of posts and timeframe specified.

    :param args: Namespace object from argparse.

    Expected Object Attributes
    --------------------------
    - keyword: The keyword to search for in the posts.
    - subreddit: The subreddit to scrape.
    - listing: The type of posts to scrape. This could be 'hot', 'new', etc.
    - timeframe: The timeframe from which to scrape posts. This could be 'day', 'week', etc.
    - limit: The maximum number of posts to scrape.

    The whole namespace is forwarded to ``create_post_branch``, which passes it on to
    ``write_post_data``.
    """
    keyword = args.keyword
    subreddit = args.subreddit
    listing = args.listing
    timeframe = args.timeframe
    limit = args.limit

    # Create main result tree.
    main_tree = Tree(
        f"[bold]{glyph.calendar} {datetime.now()}[/]", guide_style="bold bright_blue"
    )

    # Start a new session
    session = requests.session()
    # Set the User-Agent to mimic a Safari browser on a Mac.
    session.headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, "
        "like Gecko) Version/14.1.1 Safari/605.1.15"
    }

    # Send a GET request to the specified subreddit and listing,
    # limiting the response by the specified limit and timeframe.
    response = session.get(
        f"https://reddit.com/r/{subreddit}/{listing}"
        f".json?limit={limit}&t={timeframe}"
    ).json()

    posts = response["data"]["children"]

    # The keyword is lower-cased once. The post text must be lower-cased as well,
    # otherwise "Python" in a title would never match the keyword "python".
    needle = keyword.lower()

    # Counter for the number of posts that contain the keyword.
    found_posts = 0

    for post_index, post in enumerate(posts, start=1):
        data = post["data"]
        if needle in data["selftext"].lower() or needle in data["title"].lower():
            # Create a branch for the found post, titled with its index and author.
            found_tree = main_tree.add(
                f"{glyph.bust_in_silhouette} #{post_index} by [bold]@{data['author']}[/]"
            )
            found_posts += 1
            create_post_branch(post=post, keyword=keyword, tree=found_tree, args=args)

    # Summarise the number of posts in which the keyword was found.
    main_tree.add(
        f"{glyph.check_mark_button} Keyword ('{keyword}') was found in "
        f"{found_posts}/{len(posts)} {listing} posts from r/{subreddit}."
    )
    xprint(main_tree)
|
||||
202
rpst/utils.py
Normal file
202
rpst/utils.py
Normal file
@@ -0,0 +1,202 @@
|
||||
import os
|
||||
import csv
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from glyphoji import glyph
|
||||
from rich.tree import Tree
|
||||
from rich import print as xprint
|
||||
from rich.markdown import Markdown
|
||||
from rich.logging import RichHandler
|
||||
|
||||
|
||||
def convert_timestamp_to_datetime(timestamp: float) -> str:
    """
    Convert a Unix timestamp to a formatted UTC datetime string.

    :param timestamp: The Unix timestamp (seconds since the epoch) to be converted.
    :return: A formatted datetime string in the format "dd MMMM yyyy, hh:mm:ssAM/PM".
    """
    # Local import: only this block needs it, so the file's import header is untouched.
    from datetime import timezone

    # datetime.utcfromtimestamp() is deprecated since Python 3.12. The timezone-aware
    # call yields the same UTC wall-clock time.
    utc_datetime = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    return utc_datetime.strftime("%d %B %Y, %I:%M:%S%p")
|
||||
|
||||
|
||||
def create_parser():
    """
    Create and configure the parser for RPST's command line arguments.

    :return: A configured argparse.ArgumentParser object ready to parse the command line arguments.
    """
    parser = argparse.ArgumentParser(
        description="RPST (Reddit Post Scraping Tool) —by Richard Mwewa | https://about.me/rly0nheart",
        epilog="Given a subreddit name and a keyword, "
        "RPST returns all top (by default) posts that contain the specified keyword.",
    )

    parser.add_argument(
        "-k", "--keyword", help="The keyword to search for in the posts.", required=True
    )
    parser.add_argument(
        "-s", "--subreddit", help="The subreddit to scrape.", required=True
    )
    parser.add_argument(
        "-c",
        "--limit",
        help="The maximum number of posts to scrape (1-100). (default: %(default)s)",
        default=10,
        type=int,
        choices=range(1, 101),  # The limit must be between 1 and 100 inclusive.
        metavar="N",  # Without a metavar the usage line would list all 100 choices.
    )
    parser.add_argument(
        "-l",
        "--listing",
        default="top",
        const="top",
        nargs="?",
        # "top" is the default and must also be a valid choice,
        # otherwise an explicit "-l top" is rejected.
        choices=["top", "controversial", "hot", "best", "new", "rising"],
        help="The type of posts to scrape (default: %(default)s)",
    )
    parser.add_argument(
        "-t",
        "--timeframe",
        default="all",
        const="all",
        nargs="?",
        choices=["hour", "day", "week", "month", "year", "all"],
        help="The timeframe from which to scrape posts (default: %(default)s)",
    )
    parser.add_argument(
        "--json",
        help="Write all found posts to a json file.",
        action="store_true",
    )
    parser.add_argument(
        "--csv",
        help="Write all found posts to a csv file.",
        action="store_true",
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="run rpst in debug mode (show network logs)",
        action="store_true",
    )

    return parser
|
||||
|
||||
|
||||
def check_updates(version_tag: str):
    """
    Check GitHub for a newer release of RPST and announce it if there is one.

    The check is best-effort: a network failure, a non-JSON reply or an error payload
    from the GitHub API is logged as a warning and the function returns normally.

    :param version_tag: A string representing the current version of the project.
    """
    releases_url = "https://api.github.com/repos/bellingcat/reddit-post-scraping-tool/releases/latest"
    logger = logging.getLogger("rich")

    try:
        # The timeout stops a stalled connection from hanging the tool indefinitely.
        response = requests.get(releases_url, timeout=10).json()
    except (requests.RequestException, ValueError) as error:
        logger.warning(f"Could not check for updates: {error}")
        return

    # Error payloads carry a "message" instead of a "tag_name".
    latest_tag = response.get("tag_name") if isinstance(response, dict) else None
    if latest_tag is None:
        logger.warning("Could not check for updates: unexpected response from the GitHub API.")
        return

    # Nothing to announce when the latest release is the one already running.
    if latest_tag == version_tag:
        return

    xprint(
        f"{glyph.up_arrow} A new release of RPST is available ({latest_tag}). "
        f"Run 'pip install --upgrade reddit-post-scraping-tool' to get the updates."
    )

    # Rich renders the Markdown release notes directly in the terminal.
    # "body" may be null for a release without notes.
    xprint(Markdown(response.get("body") or ""))
|
||||
|
||||
|
||||
def set_loglevel(args: argparse.Namespace) -> logging.Logger:
    """
    Configure logging according to the parsed command line arguments.

    If `args.debug` is True the level is "NOTSET", so all log messages are displayed.
    Otherwise the level is "INFO" and only informational and higher-severity messages
    are displayed.

    Records are rendered by a RichHandler with markup enabled, a "[HH:MM:SSAM/PM]" time
    format and the log level column hidden.

    :param args: A namespace object from argparse containing the debugging option (args.debug).
    :return: The logger named "rich".
    """
    # Only the level differs between debug and normal mode, so the handler is built once.
    logging.basicConfig(
        level="NOTSET" if args.debug else "INFO",
        format="%(message)s",
        handlers=[
            RichHandler(
                markup=True, log_time_format="[%H:%M:%S%p]", show_level=False
            )
        ],
    )

    return logging.getLogger("rich")
|
||||
|
||||
|
||||
def write_post_data(post_data: dict, filename: str, args, tree_branch: Tree):
    """
    Append a post to ``<filename>.json`` and/or ``<filename>.csv`` in the user's home
    directory, and record on ``tree_branch`` what was written or skipped.

    :param post_data: A dictionary containing post data.
    :param filename: Base name (without extension) of the output file(s).
    :param args: A namespace object from argparse containing the output format options
        (args.json and args.csv).
    :param tree_branch: A rich Tree object to which status information will be added.
    """
    output_directory = os.path.expanduser("~")

    # JSON: one document per line, appended to any existing file.
    if not args.json:
        tree_branch.add(
            f"{glyph.cross_mark_button} JSON data writing operation was skipped. No changes made."
        )
    else:
        json_file_path = os.path.join(output_directory, f"{filename}.json")
        with open(json_file_path, "a", encoding="utf-8") as json_file:
            json_file.write(json.dumps(post_data, ensure_ascii=False))
            json_file.write("\n")
        tree_branch.add(
            f"{glyph.page_facing_up} JSON data successfully written/appended to file: "
            f"[italic][link file://{json_file_path}]{json_file_path}[/]"
        )

    # CSV: one row per post; the header row is written only into an empty file.
    if not args.csv:
        tree_branch.add(
            f"{glyph.cross_mark_button} CSV data writing operation was skipped. No changes made."
        )
    else:
        csv_file_path = os.path.join(output_directory, f"{filename}.csv")
        with open(csv_file_path, "a", newline="", encoding="utf-8") as csv_file:
            row_writer = csv.DictWriter(csv_file, fieldnames=post_data.keys())
            file_is_empty = csv_file.tell() == 0
            if file_is_empty:
                row_writer.writeheader()
            row_writer.writerow(post_data)
        tree_branch.add(
            f"{glyph.page_facing_up} CSV data successfully written/appended to file: "
            f"[italic][link file://{csv_file_path}]{csv_file_path}[/]"
        )
|
||||
Reference in New Issue
Block a user