removed unused archive_media argument passed to methods throughout codebase

This commit is contained in:
Tristan Lee
2023-08-03 18:05:50 -05:00
parent edd772eb94
commit d3b8e1a3b3
13 changed files with 36 additions and 126 deletions

11
app.py
View File

@@ -81,16 +81,16 @@ def get_transformer_controller(args):
def scrape_channels(args):
logger.info(f"Scraping channels, media: {args.media}")
logger.info(f"Scraping channels")
controller = get_scraper_controller(args)
controller.scrape_all_channels(archive_media=args.media)
controller.scrape_all_channels()
def scrape_channels_old(args):
logger.info(f"Scraping old posts from channels, media: {args.media}")
logger.info(f"Scraping old posts from channels")
controller = get_scraper_controller(args)
controller.scrape_all_channels(archive_media=args.media, fetch_old=True)
controller.scrape_all_channels(fetch_old=True)
def scrape_channel_info(args):
logger.info(f"Scraping channel info")
@@ -153,9 +153,6 @@ if __name__ == "__main__":
parser.add_argument(
"--gsheet", type=str, help="[sync-channels] URL of Google Sheet to synchronize"
)
parser.add_argument(
"--media", action="store_true", help="[scrape-channels] Add this flag to media"
)
parser.add_argument("--chronological", action="store_true")
parser.add_argument("--telethon_session", type=str)
parser.add_argument("--min_date", type=str)

View File

@@ -11,6 +11,7 @@ import ffmpeg
from sqlalchemy.orm import sessionmaker
import yt_dlp
from sqlalchemy.sql.expression import func
from sqlalchemy.orm.session import close_all_sessions
from pathlib import Path
from sqlalchemy import nullsfirst
@@ -256,7 +257,7 @@ class Scraper:
Parameters
----------
result: ScraperResult
Previously scraped ScraperResult run with ``archive_media=False``.
Previously scraped ScraperResult.
Returns
-------
@@ -291,7 +292,7 @@ class Scraper:
raise NotImplementedError
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
"""Scrape all posts from the specified Channel.
Parameters
@@ -301,9 +302,6 @@ class Scraper:
since: ScraperResult or None
Most recently scraped ScraperResult from a previous scrape, or
``None`` if scraper has not run before.
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
Yields
------
@@ -348,14 +346,11 @@ class ScraperController:
"""
self.scrapers = []
def scrape_all_channels(self, archive_media: bool = True, fetch_old: bool = False):
def scrape_all_channels(self, fetch_old: bool = False):
"""Scrape posts from all channels in the database, that satisfy a researcher-specified criteria
Parameters
----------
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
fetch_old: bool
If ``True``, scrape all posts from channels, regardless of when channel was last scraped.
If ``False``, scrape only posts that are more recent than the previous scrape of each channel.
@@ -371,7 +366,7 @@ class ScraperController:
session.close()
return self.scrape_channels(channels, archive_media=archive_media, fetch_old=fetch_old)
return self.scrape_channels(channels, fetch_old=fetch_old)
def scrape_all_channel_info(self):
"""Scrape profile information from all channels in the database.
@@ -393,16 +388,13 @@ class ScraperController:
session.close()
return self.scrape_channel_info(channels)
def scrape_channels(self, channels: List[Channel], archive_media: bool = True, fetch_old: bool = False):
def scrape_channels(self, channels: List[Channel], fetch_old: bool = False):
"""Scrape all posts from a specified list of channels.
Parameters
----------
channels: list<Channel>
List of Channel instances to be scraped
archive_media: bool
If ``True``, any media files (images, video, etc.) from posts are archived.
If ``False``, media files are not archived.
fetch_old: bool
If ``True``, scrape all posts from channels, regardless of when channel was last scraped.
If ``False``, scrape only posts that are more recent than the previous scrape of each channel.
@@ -450,7 +442,7 @@ class ScraperController:
else:
until = None
posts = scraper.get_posts(channel, until=until, archive_media=archive_media)
posts = scraper.get_posts(channel, until=until)
else:
# get most recent post
@@ -466,7 +458,7 @@ class ScraperController:
else:
since = None
posts = scraper.get_posts(channel, since=since, archive_media=archive_media)
posts = scraper.get_posts(channel, since=since)
for post in posts:
session.add(post)
@@ -610,7 +602,7 @@ class ScraperController:
"""Drop all data from the connected SQLAlchemy database.
"""
self.session.close_all()
close_all_sessions()
mapper_registry.metadata.drop_all(bind=self.engine)
self.connect_to_db(self.engine)

View File

@@ -25,7 +25,7 @@ class BitchuteScraper(Scraper):
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
session = requests.Session()
session.headers.update(self.headers)
@@ -50,12 +50,6 @@ class BitchuteScraper(Scraper):
url = post['video_url']
archived_urls[url] = None
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="Bitchute",
@@ -65,7 +59,7 @@ class BitchuteScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
media_archived=None)
def can_handle(self, channel):
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:

View File

@@ -24,7 +24,7 @@ class GabScraper(Scraper):
return group_id
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
client = Client(
username = os.environ['GAB_USER'],
password = os.environ['GAB_PASS'],
@@ -67,13 +67,6 @@ class GabScraper(Scraper):
else:
archived_urls[attachment['url']] = None
for url in archived_urls.keys():
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="Gab",
@@ -83,7 +76,7 @@ class GabScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
media_archived=None)
def can_handle(self, channel: Channel) -> bool:
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:

View File

@@ -21,7 +21,7 @@ class GettrScraper(Scraper):
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
client = PublicClient()
username = self.get_username_from_url(channel.url).lower()
scraper = client.user_activity(username=username, type="posts")
@@ -45,13 +45,6 @@ class GettrScraper(Scraper):
url = "https://media.gettr.com/" + post['ovid']
archived_urls[url] = None
for url in archived_urls.keys():
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="Gettr",
@@ -61,7 +54,7 @@ class GettrScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
media_archived=None)
def can_handle(self, channel):
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:

View File

@@ -26,7 +26,7 @@ class InstagramScraper(Scraper):
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)
@@ -52,13 +52,6 @@ class InstagramScraper(Scraper):
archived_urls = get_archived_urls_from_post(post = post)
for url in archived_urls.keys():
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="Instagram",
@@ -68,7 +61,7 @@ class InstagramScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post._asdict(), default=str),
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
media_archived=None)
for comment in post.get_comments():

View File

@@ -26,7 +26,7 @@ class OdyseeScraper(Scraper):
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)
scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token)
@@ -43,18 +43,6 @@ class OdyseeScraper(Scraper):
else:
archived_urls = {url: None}
if archive_media:
# Check if file is a video file or an m3u8 file
r = requests.head(url)
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
media_blob, content_type, key = self.m3u8_url_to_blob(url)
else:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
raw_comment_info_list = get_all_comments(video_id=video.claim_id)
all_comments = (process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list)
@@ -67,7 +55,7 @@ class OdyseeScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video.__dict__, default = str),
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
media_archived=None)
for comment in all_comments:

View File

@@ -20,7 +20,7 @@ class RumbleScraper(Scraper):
cookiefilename = 'cookiefile.txt'
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
scraper = get_channel_videos(channel.url)
@@ -32,12 +32,6 @@ class RumbleScraper(Scraper):
archived_urls = {url: None}
if archive_media:
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="Rumble",
@@ -47,7 +41,7 @@ class RumbleScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post, default = str),
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
media_archived=None)
def url_to_key(self, url: str, content_type: str) -> str:
ext = '.' + content_type.split('/')[-1]

View File

@@ -131,8 +131,8 @@ class TelegramTelethonScraper(Scraper):
if channel.platform == "Telegram":
return True
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
# @logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None) -> Generator[ScraperResult, None, None]:
username = TelegramTelethonScraper.get_channel_identifier(channel)
if until is not None:
logger.info(f"Only getting old posts, up to ID {until.platform_id.split('/')[-1]}")
@@ -157,13 +157,6 @@ class TelegramTelethonScraper(Scraper):
archived_urls[post_url] = None
media_archived = None
# if archive_media:
# blob, output_file_with_ext = self.archive_post_media(post, client)
# if blob is not None:
# # TODO specify Content-Type
# archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
# archived_urls[post_url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="Telegram",
@@ -187,7 +180,7 @@ class TelegramTelethonScraper(Scraper):
raw_data=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls,
media_archived=media_archived)
for p in self.get_posts(channel, since=since, until=new_until, archive_media=archive_media):
for p in self.get_posts(channel, since=since, until=new_until):
yield p

View File

@@ -13,7 +13,7 @@ class TwitterScraper(Scraper):
__version__ = "TwitterScraper 0.0.0"
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
if channel.platform_id:
identifier = int(channel.platform_id)
else:
@@ -60,11 +60,6 @@ class TwitterScraper(Scraper):
if url is not None and url not in archived_urls:
archived_urls[url] = None
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="Twitter",
@@ -74,7 +69,7 @@ class TwitterScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=tweet.json(),
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
media_archived=None)
def can_handle(self, channel):
if channel.platform == "Twitter" and (channel.platform_id or channel.screenname):

View File

@@ -21,7 +21,7 @@ class VkontakteScraper(Scraper):
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)
scraper = VKontakteUserScraper(username)
@@ -51,17 +51,6 @@ class VkontakteScraper(Scraper):
if post.video:
archived_urls[post.video.url] = None
for url in archived_urls.keys():
if archive_media:
if re.match(VKIE._VALID_URL, url):
# Uses regex from yt_dlp to verify VK video URL
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
else:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
platform="VK",
@@ -71,7 +60,7 @@ class VkontakteScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
media_archived=None)
@logger.catch
def archive_files(self, result: ScraperResult) -> ScraperResult:

View File

@@ -19,7 +19,7 @@ class YoutubeScraper(Scraper):
cookiefilename = 'cookiefile.txt'
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
content_type = 'video/mp4'
@@ -53,7 +53,7 @@ class YoutubeScraper(Scraper):
try:
meta = ydl.extract_info(
channel.url,
download=archive_media)
download=False)
except yt_dlp.utils.DownloadError as e:
raise e
else:
@@ -67,17 +67,6 @@ class YoutubeScraper(Scraper):
archived_urls = {url: None}
video_id = video["id"]
video_ext = video["ext"]
if archive_media:
key = f"{video_id}.{video_ext}"
with open(Path(temp_dir)/key, "rb") as f:
media_blob = f.read()
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
@@ -88,7 +77,7 @@ class YoutubeScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video, default = str),
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
media_archived=None)
def can_handle(self, channel):
if channel.platform == "Youtube" and channel.url:

View File

@@ -46,7 +46,7 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
scraper = CONTROLLERS[platform]['scraper']
controller.register_scraper(scraper = scraper())
controller.scrape_channels(channels = channels, archive_media = False)
controller.scrape_channels(channels = channels)
controller.scrape_all_channel_info()
controller.archive_unarchived_media_batch()