From d3b8e1a3b373755d79b2ad1458685b26ffb4612e Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Thu, 3 Aug 2023 18:05:50 -0500 Subject: [PATCH] removed unused archive_media argument passed to methods throughout codebase --- app.py | 11 ++++------- cisticola/scraper/base.py | 26 +++++++++----------------- cisticola/scraper/bitchute.py | 10 ++-------- cisticola/scraper/gab.py | 11 ++--------- cisticola/scraper/gettr.py | 11 ++--------- cisticola/scraper/instagram.py | 11 ++--------- cisticola/scraper/odysee.py | 16 ++-------------- cisticola/scraper/rumble.py | 10 ++-------- cisticola/scraper/telegram_telethon.py | 13 +++---------- cisticola/scraper/twitter.py | 9 ++------- cisticola/scraper/vkontakte.py | 15 ++------------- cisticola/scraper/youtube.py | 17 +++-------------- tests/base.py | 2 +- 13 files changed, 36 insertions(+), 126 deletions(-) diff --git a/app.py b/app.py index afa3b6c..8be3607 100644 --- a/app.py +++ b/app.py @@ -81,16 +81,16 @@ def get_transformer_controller(args): def scrape_channels(args): - logger.info(f"Scraping channels, media: {args.media}") + logger.info(f"Scraping channels") controller = get_scraper_controller(args) - controller.scrape_all_channels(archive_media=args.media) + controller.scrape_all_channels() def scrape_channels_old(args): - logger.info(f"Scraping old posts from channels, media: {args.media}") + logger.info(f"Scraping old posts from channels") controller = get_scraper_controller(args) - controller.scrape_all_channels(archive_media=args.media, fetch_old=True) + controller.scrape_all_channels(fetch_old=True) def scrape_channel_info(args): logger.info(f"Scraping channel info") @@ -153,9 +153,6 @@ if __name__ == "__main__": parser.add_argument( "--gsheet", type=str, help="[sync-channels] URL of Google Sheet to synchronize" ) - parser.add_argument( - "--media", action="store_true", help="[scrape-channels] Add this flag to media" - ) parser.add_argument("--chronological", action="store_true") parser.add_argument("--telethon_session", type=str) parser.add_argument("--min_date", type=str) diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index a520e0d..bb089bd 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -11,6 +11,7 @@ import ffmpeg from sqlalchemy.orm import sessionmaker import yt_dlp from sqlalchemy.sql.expression import func +from sqlalchemy.orm.session import close_all_sessions from pathlib import Path from sqlalchemy import nullsfirst @@ -256,7 +257,7 @@ class Scraper: Parameters ---------- result: ScraperResult - Previously scraped ScraperResult run with ``archive_media=False``. + Previously scraped ScraperResult. Returns ------- @@ -291,7 +292,7 @@ class Scraper: raise NotImplementedError @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: """Scrape all posts from the specified Channel. Parameters @@ -301,9 +302,6 @@ class Scraper: since: ScraperResult or None Most recently scraped ScraperResult from a previous scrape, or ``None`` if scraper has not run before. - archive_media: bool - If ``True``, any media files (images, video, etc.) from posts are archived. - If ``False``, media files are not archived. Yields ------ @@ -348,14 +346,11 @@ class ScraperController: """ self.scrapers = [] - def scrape_all_channels(self, archive_media: bool = True, fetch_old: bool = False): + def scrape_all_channels(self, fetch_old: bool = False): """Scrape posts from all channels in the database, that satisfy a researcher-specified criteria Parameters ---------- - archive_media: bool - If ``True``, any media files (images, video, etc.) from posts are archived. - If ``False``, media files are not archived. fetch_old: bool If ``True``, scrape all posts from channels, regardless of when channel was last scraped. If ``False``, scrape only posts that are more recent than the previous scrape of each channel. @@ -371,7 +366,7 @@ class ScraperController: session.close() - return self.scrape_channels(channels, archive_media=archive_media, fetch_old=fetch_old) + return self.scrape_channels(channels, fetch_old=fetch_old) def scrape_all_channel_info(self): """Scrape profile information from all channels in the database. @@ -393,16 +388,13 @@ class ScraperController: session.close() return self.scrape_channel_info(channels) - def scrape_channels(self, channels: List[Channel], archive_media: bool = True, fetch_old: bool = False): + def scrape_channels(self, channels: List[Channel], fetch_old: bool = False): """Scrape all posts from a specified list of channels. Parameters ---------- channels: list List of Channel instances to be scraped - archive_media: bool - If ``True``, any media files (images, video, etc.) from posts are archived. - If ``False``, media files are not archived. fetch_old: bool If ``True``, scrape all posts from channels, regardless of when channel was last scraped. If ``False``, scrape only posts that are more recent than the previous scrape of each channel. @@ -450,7 +442,7 @@ class ScraperController: else: until = None - posts = scraper.get_posts(channel, until=until, archive_media=archive_media) + posts = scraper.get_posts(channel, until=until) else: # get most recent post @@ -466,7 +458,7 @@ class ScraperController: else: since = None - posts = scraper.get_posts(channel, since=since, archive_media=archive_media) + posts = scraper.get_posts(channel, since=since) for post in posts: session.add(post) @@ -610,7 +602,7 @@ class ScraperController: """Drop all data from the connected SQLAlchemy database. """ - self.session.close_all() + close_all_sessions() mapper_registry.metadata.drop_all(bind=self.engine) self.connect_to_db(self.engine) diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index f5c79fb..145c3dc 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -25,7 +25,7 @@ class BitchuteScraper(Scraper): return username @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: session = requests.Session() session.headers.update(self.headers) @@ -50,12 +50,6 @@ class BitchuteScraper(Scraper): url = post['video_url'] archived_urls[url] = None - if archive_media: - - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url - yield ScraperResult( scraper=self.__version__, platform="Bitchute", @@ -65,7 +59,7 @@ class BitchuteScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post), archived_urls=archived_urls, - media_archived=datetime.now(timezone.utc) if archive_media else None) + media_archived=None) def can_handle(self, channel): if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py index 5602489..b4bdd3d 100644 --- a/cisticola/scraper/gab.py +++ b/cisticola/scraper/gab.py @@ -24,7 +24,7 @@ class GabScraper(Scraper): return group_id @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: client = Client( username = os.environ['GAB_USER'], password = os.environ['GAB_PASS'], @@ -67,13 +67,6 @@ class GabScraper(Scraper): else: archived_urls[attachment['url']] = None - for url in archived_urls.keys(): - - if archive_media: - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url - yield ScraperResult( scraper=self.__version__, platform="Gab", @@ -83,7 +76,7 @@ class GabScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post), archived_urls=archived_urls, - media_archived=datetime.now(timezone.utc) if archive_media else None) + media_archived=None) def can_handle(self, channel: Channel) -> bool: if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/gettr.py b/cisticola/scraper/gettr.py index 1fa0599..953a771 100644 --- a/cisticola/scraper/gettr.py +++ b/cisticola/scraper/gettr.py @@ -21,7 +21,7 @@ class GettrScraper(Scraper): return username @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: client = PublicClient() username = self.get_username_from_url(channel.url).lower() scraper = client.user_activity(username=username, type="posts") @@ -45,13 +45,6 @@ class GettrScraper(Scraper): url = "https://media.gettr.com/" + post['ovid'] archived_urls[url] = None - for url in archived_urls.keys(): - - if archive_media: - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url - yield ScraperResult( scraper=self.__version__, platform="Gettr", @@ -61,7 +54,7 @@ class GettrScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post), archived_urls=archived_urls, - media_archived=datetime.now(timezone.utc) if archive_media else None) + media_archived=None) def can_handle(self, channel): if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None: diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py index 3eca83e..04c0ca4 100644 --- a/cisticola/scraper/instagram.py +++ b/cisticola/scraper/instagram.py @@ -26,7 +26,7 @@ class InstagramScraper(Scraper): return username @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: username = self.get_username_from_url(channel.url) @@ -52,13 +52,6 @@ class InstagramScraper(Scraper): archived_urls = get_archived_urls_from_post(post = post) - for url in archived_urls.keys(): - - if archive_media: - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url - yield ScraperResult( scraper=self.__version__, platform="Instagram", @@ -68,7 +61,7 @@ class InstagramScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post._asdict(), default=str), archived_urls=archived_urls, - media_archived=datetime.now(timezone.utc) if archive_media else None) + media_archived=None) for comment in post.get_comments(): diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py index 1434204..a086470 100644 --- a/cisticola/scraper/odysee.py +++ b/cisticola/scraper/odysee.py @@ -26,7 +26,7 @@ class OdyseeScraper(Scraper): return username @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: username = self.get_username_from_url(channel.url) scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token) @@ -43,18 +43,6 @@ class OdyseeScraper(Scraper): else: archived_urls = {url: None} - if archive_media: - - # Check if file is a video file or an m3u8 file - r = requests.head(url) - if r.headers['Content-Type'] == 'text/html; charset=utf-8': - media_blob, content_type, key = self.m3u8_url_to_blob(url) - else: - media_blob, content_type, key = self.url_to_blob(url) - - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url - raw_comment_info_list = get_all_comments(video_id=video.claim_id) all_comments = (process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list) @@ -67,7 +55,7 @@ class OdyseeScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=json.dumps(video.__dict__, default = str), archived_urls=archived_urls, - media_archived=datetime.now(timezone.utc) if archive_media else None) + media_archived=None) for comment in all_comments: diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index b044a5f..05f65a0 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -20,7 +20,7 @@ class RumbleScraper(Scraper): cookiefilename = 'cookiefile.txt' @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: scraper = get_channel_videos(channel.url) @@ -32,12 +32,6 @@ class RumbleScraper(Scraper): archived_urls = {url: None} - if archive_media: - - media_blob, content_type, key = self.ytdlp_url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url - yield ScraperResult( scraper=self.__version__, platform="Rumble", @@ -47,7 +41,7 @@ class RumbleScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post, default = str), archived_urls=archived_urls, - media_archived=datetime.now(timezone.utc) if archive_media else None) + media_archived=None) def url_to_key(self, url: str, content_type: str) -> str: ext = '.' + content_type.split('/')[-1] diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index b60162e..8f4c536 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -131,8 +131,8 @@ class TelegramTelethonScraper(Scraper): if channel.platform == "Telegram": return True - @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + # @logger.catch + def get_posts(self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None) -> Generator[ScraperResult, None, None]: username = TelegramTelethonScraper.get_channel_identifier(channel) if until is not None: logger.info(f"Only getting old posts, up to ID {until.platform_id.split('/')[-1]}") @@ -157,13 +157,6 @@ class TelegramTelethonScraper(Scraper): archived_urls[post_url] = None media_archived = None - # if archive_media: - # blob, output_file_with_ext = self.archive_post_media(post, client) - # if blob is not None: - # # TODO specify Content-Type - # archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext) - # archived_urls[post_url] = archived_url - yield ScraperResult( scraper=self.__version__, platform="Telegram", @@ -187,7 +180,7 @@ class TelegramTelethonScraper(Scraper): raw_data=json.dumps(post.to_dict(), default=str), archived_urls=archived_urls, media_archived=media_archived) - for p in self.get_posts(channel, since=since, until=new_until, archive_media=archive_media): + for p in self.get_posts(channel, since=since, until=new_until): yield p diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 3ccef23..318cb8d 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -13,7 +13,7 @@ class TwitterScraper(Scraper): __version__ = "TwitterScraper 0.0.0" @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: if channel.platform_id: identifier = int(channel.platform_id) else: @@ -60,11 +60,6 @@ class TwitterScraper(Scraper): if url is not None and url not in archived_urls: archived_urls[url] = None - if archive_media: - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url - yield ScraperResult( scraper=self.__version__, platform="Twitter", @@ -74,7 +69,7 @@ class TwitterScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=tweet.json(), archived_urls=archived_urls, - media_archived=datetime.now(timezone.utc) if archive_media else None) + media_archived=None) def can_handle(self, channel): if channel.platform == "Twitter" and (channel.platform_id or channel.screenname): diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py index 3ef1648..f7fd7d8 100644 --- a/cisticola/scraper/vkontakte.py +++ b/cisticola/scraper/vkontakte.py @@ -21,7 +21,7 @@ class VkontakteScraper(Scraper): return username @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: username = self.get_username_from_url(channel.url) scraper = VKontakteUserScraper(username) @@ -51,17 +51,6 @@ class VkontakteScraper(Scraper): if post.video: archived_urls[post.video.url] = None - for url in archived_urls.keys(): - - if archive_media: - if re.match(VKIE._VALID_URL, url): - # Uses regex from yt_dlp to verify VK video URL - media_blob, content_type, key = self.ytdlp_url_to_blob(url) - else: - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url - yield ScraperResult( scraper=self.__version__, platform="VK", @@ -71,7 +60,7 @@ class VkontakteScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=post.json(), archived_urls=archived_urls, - media_archived=datetime.now(timezone.utc) if archive_media else None) + media_archived=None) @logger.catch def archive_files(self, result: ScraperResult) -> ScraperResult: diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py index f1d8455..e4210f9 100644 --- a/cisticola/scraper/youtube.py +++ b/cisticola/scraper/youtube.py @@ -19,7 +19,7 @@ class YoutubeScraper(Scraper): cookiefilename = 'cookiefile.txt' @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: content_type = 'video/mp4' @@ -53,7 +53,7 @@ class YoutubeScraper(Scraper): try: meta = ydl.extract_info( channel.url, - download=archive_media) + download=False) except yt_dlp.utils.DownloadError as e: raise e else: @@ -67,17 +67,6 @@ class YoutubeScraper(Scraper): archived_urls = {url: None} video_id = video["id"] - video_ext = video["ext"] - - if archive_media: - - key = f"{video_id}.{video_ext}" - - with open(Path(temp_dir)/key, "rb") as f: - media_blob = f.read() - - archived_url = self.archive_blob(media_blob, content_type, key) - archived_urls[url] = archived_url yield ScraperResult( scraper=self.__version__, @@ -88,7 +77,7 @@ class YoutubeScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=json.dumps(video, default = str), archived_urls=archived_urls, - media_archived=datetime.now(timezone.utc) if archive_media else None) + media_archived=None) def can_handle(self, channel): if channel.platform == "Youtube" and channel.url: diff --git a/tests/base.py b/tests/base.py index 50c6723..2cc2bb8 100644 --- a/tests/base.py +++ b/tests/base.py @@ -46,7 +46,7 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller, scraper = CONTROLLERS[platform]['scraper'] controller.register_scraper(scraper = scraper()) - controller.scrape_channels(channels = channels, archive_media = False) + controller.scrape_channels(channels = channels) controller.scrape_all_channel_info() controller.archive_unarchived_media_batch()