mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-07 19:08:35 +03:00
Removed the unused `archive_media` argument passed to methods throughout the codebase
This commit is contained in:
11
app.py
11
app.py
@@ -81,16 +81,16 @@ def get_transformer_controller(args):
|
||||
|
||||
|
||||
def scrape_channels(args):
|
||||
logger.info(f"Scraping channels, media: {args.media}")
|
||||
logger.info(f"Scraping channels")
|
||||
|
||||
controller = get_scraper_controller(args)
|
||||
controller.scrape_all_channels(archive_media=args.media)
|
||||
controller.scrape_all_channels()
|
||||
|
||||
def scrape_channels_old(args):
|
||||
logger.info(f"Scraping old posts from channels, media: {args.media}")
|
||||
logger.info(f"Scraping old posts from channels")
|
||||
|
||||
controller = get_scraper_controller(args)
|
||||
controller.scrape_all_channels(archive_media=args.media, fetch_old=True)
|
||||
controller.scrape_all_channels(fetch_old=True)
|
||||
|
||||
def scrape_channel_info(args):
|
||||
logger.info(f"Scraping channel info")
|
||||
@@ -153,9 +153,6 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--gsheet", type=str, help="[sync-channels] URL of Google Sheet to synchronize"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--media", action="store_true", help="[scrape-channels] Add this flag to media"
|
||||
)
|
||||
parser.add_argument("--chronological", action="store_true")
|
||||
parser.add_argument("--telethon_session", type=str)
|
||||
parser.add_argument("--min_date", type=str)
|
||||
|
||||
@@ -11,6 +11,7 @@ import ffmpeg
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import yt_dlp
|
||||
from sqlalchemy.sql.expression import func
|
||||
from sqlalchemy.orm.session import close_all_sessions
|
||||
from pathlib import Path
|
||||
from sqlalchemy import nullsfirst
|
||||
|
||||
@@ -256,7 +257,7 @@ class Scraper:
|
||||
Parameters
|
||||
----------
|
||||
result: ScraperResult
|
||||
Previously scraped ScraperResult run with ``archive_media=False``.
|
||||
Previously scraped ScraperResult.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -291,7 +292,7 @@ class Scraper:
|
||||
raise NotImplementedError
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
"""Scrape all posts from the specified Channel.
|
||||
|
||||
Parameters
|
||||
@@ -301,9 +302,6 @@ class Scraper:
|
||||
since: ScraperResult or None
|
||||
Most recently scraped ScraperResult from a previous scrape, or
|
||||
``None`` if scraper has not run before.
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
|
||||
Yields
|
||||
------
|
||||
@@ -348,14 +346,11 @@ class ScraperController:
|
||||
"""
|
||||
self.scrapers = []
|
||||
|
||||
def scrape_all_channels(self, archive_media: bool = True, fetch_old: bool = False):
|
||||
def scrape_all_channels(self, fetch_old: bool = False):
|
||||
"""Scrape posts from all channels in the database, that satisfy a researcher-specified criteria
|
||||
|
||||
Parameters
|
||||
----------
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
fetch_old: bool
|
||||
If ``True``, scrape all posts from channels, regardless of when channel was last scraped.
|
||||
If ``False``, scrape only posts that are more recent than the previous scrape of each channel.
|
||||
@@ -371,7 +366,7 @@ class ScraperController:
|
||||
|
||||
session.close()
|
||||
|
||||
return self.scrape_channels(channels, archive_media=archive_media, fetch_old=fetch_old)
|
||||
return self.scrape_channels(channels, fetch_old=fetch_old)
|
||||
|
||||
def scrape_all_channel_info(self):
|
||||
"""Scrape profile information from all channels in the database.
|
||||
@@ -393,16 +388,13 @@ class ScraperController:
|
||||
session.close()
|
||||
return self.scrape_channel_info(channels)
|
||||
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True, fetch_old: bool = False):
|
||||
def scrape_channels(self, channels: List[Channel], fetch_old: bool = False):
|
||||
"""Scrape all posts from a specified list of channels.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
channels: list<Channel>
|
||||
List of Channel instances to be scraped
|
||||
archive_media: bool
|
||||
If ``True``, any media files (images, video, etc.) from posts are archived.
|
||||
If ``False``, media files are not archived.
|
||||
fetch_old: bool
|
||||
If ``True``, scrape all posts from channels, regardless of when channel was last scraped.
|
||||
If ``False``, scrape only posts that are more recent than the previous scrape of each channel.
|
||||
@@ -450,7 +442,7 @@ class ScraperController:
|
||||
else:
|
||||
until = None
|
||||
|
||||
posts = scraper.get_posts(channel, until=until, archive_media=archive_media)
|
||||
posts = scraper.get_posts(channel, until=until)
|
||||
|
||||
else:
|
||||
# get most recent post
|
||||
@@ -466,7 +458,7 @@ class ScraperController:
|
||||
else:
|
||||
since = None
|
||||
|
||||
posts = scraper.get_posts(channel, since=since, archive_media=archive_media)
|
||||
posts = scraper.get_posts(channel, since=since)
|
||||
|
||||
for post in posts:
|
||||
session.add(post)
|
||||
@@ -610,7 +602,7 @@ class ScraperController:
|
||||
"""Drop all data from the connected SQLAlchemy database.
|
||||
"""
|
||||
|
||||
self.session.close_all()
|
||||
close_all_sessions()
|
||||
|
||||
mapper_registry.metadata.drop_all(bind=self.engine)
|
||||
self.connect_to_db(self.engine)
|
||||
|
||||
@@ -25,7 +25,7 @@ class BitchuteScraper(Scraper):
|
||||
return username
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
session = requests.Session()
|
||||
session.headers.update(self.headers)
|
||||
@@ -50,12 +50,6 @@ class BitchuteScraper(Scraper):
|
||||
url = post['video_url']
|
||||
archived_urls[url] = None
|
||||
|
||||
if archive_media:
|
||||
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Bitchute",
|
||||
@@ -65,7 +59,7 @@ class BitchuteScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
media_archived=None)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
||||
|
||||
@@ -24,7 +24,7 @@ class GabScraper(Scraper):
|
||||
return group_id
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
client = Client(
|
||||
username = os.environ['GAB_USER'],
|
||||
password = os.environ['GAB_PASS'],
|
||||
@@ -67,13 +67,6 @@ class GabScraper(Scraper):
|
||||
else:
|
||||
archived_urls[attachment['url']] = None
|
||||
|
||||
for url in archived_urls.keys():
|
||||
|
||||
if archive_media:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Gab",
|
||||
@@ -83,7 +76,7 @@ class GabScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
media_archived=None)
|
||||
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
|
||||
|
||||
@@ -21,7 +21,7 @@ class GettrScraper(Scraper):
|
||||
return username
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
client = PublicClient()
|
||||
username = self.get_username_from_url(channel.url).lower()
|
||||
scraper = client.user_activity(username=username, type="posts")
|
||||
@@ -45,13 +45,6 @@ class GettrScraper(Scraper):
|
||||
url = "https://media.gettr.com/" + post['ovid']
|
||||
archived_urls[url] = None
|
||||
|
||||
for url in archived_urls.keys():
|
||||
|
||||
if archive_media:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Gettr",
|
||||
@@ -61,7 +54,7 @@ class GettrScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
media_archived=None)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
|
||||
|
||||
@@ -26,7 +26,7 @@ class InstagramScraper(Scraper):
|
||||
return username
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
@@ -52,13 +52,6 @@ class InstagramScraper(Scraper):
|
||||
|
||||
archived_urls = get_archived_urls_from_post(post = post)
|
||||
|
||||
for url in archived_urls.keys():
|
||||
|
||||
if archive_media:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Instagram",
|
||||
@@ -68,7 +61,7 @@ class InstagramScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post._asdict(), default=str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
media_archived=None)
|
||||
|
||||
for comment in post.get_comments():
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ class OdyseeScraper(Scraper):
|
||||
return username
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token)
|
||||
@@ -43,18 +43,6 @@ class OdyseeScraper(Scraper):
|
||||
else:
|
||||
archived_urls = {url: None}
|
||||
|
||||
if archive_media:
|
||||
|
||||
# Check if file is a video file or an m3u8 file
|
||||
r = requests.head(url)
|
||||
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
|
||||
media_blob, content_type, key = self.m3u8_url_to_blob(url)
|
||||
else:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
raw_comment_info_list = get_all_comments(video_id=video.claim_id)
|
||||
all_comments = (process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list)
|
||||
|
||||
@@ -67,7 +55,7 @@ class OdyseeScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(video.__dict__, default = str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
media_archived=None)
|
||||
|
||||
for comment in all_comments:
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ class RumbleScraper(Scraper):
|
||||
cookiefilename = 'cookiefile.txt'
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
scraper = get_channel_videos(channel.url)
|
||||
|
||||
@@ -32,12 +32,6 @@ class RumbleScraper(Scraper):
|
||||
|
||||
archived_urls = {url: None}
|
||||
|
||||
if archive_media:
|
||||
|
||||
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Rumble",
|
||||
@@ -47,7 +41,7 @@ class RumbleScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post, default = str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
media_archived=None)
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
ext = '.' + content_type.split('/')[-1]
|
||||
|
||||
@@ -131,8 +131,8 @@ class TelegramTelethonScraper(Scraper):
|
||||
if channel.platform == "Telegram":
|
||||
return True
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
# @logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
username = TelegramTelethonScraper.get_channel_identifier(channel)
|
||||
if until is not None:
|
||||
logger.info(f"Only getting old posts, up to ID {until.platform_id.split('/')[-1]}")
|
||||
@@ -157,13 +157,6 @@ class TelegramTelethonScraper(Scraper):
|
||||
archived_urls[post_url] = None
|
||||
media_archived = None
|
||||
|
||||
# if archive_media:
|
||||
# blob, output_file_with_ext = self.archive_post_media(post, client)
|
||||
# if blob is not None:
|
||||
# # TODO specify Content-Type
|
||||
# archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
|
||||
# archived_urls[post_url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Telegram",
|
||||
@@ -187,7 +180,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
raw_data=json.dumps(post.to_dict(), default=str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=media_archived)
|
||||
for p in self.get_posts(channel, since=since, until=new_until, archive_media=archive_media):
|
||||
for p in self.get_posts(channel, since=since, until=new_until):
|
||||
yield p
|
||||
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ class TwitterScraper(Scraper):
|
||||
__version__ = "TwitterScraper 0.0.0"
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
if channel.platform_id:
|
||||
identifier = int(channel.platform_id)
|
||||
else:
|
||||
@@ -60,11 +60,6 @@ class TwitterScraper(Scraper):
|
||||
if url is not None and url not in archived_urls:
|
||||
archived_urls[url] = None
|
||||
|
||||
if archive_media:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Twitter",
|
||||
@@ -74,7 +69,7 @@ class TwitterScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=tweet.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
media_archived=None)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Twitter" and (channel.platform_id or channel.screenname):
|
||||
|
||||
@@ -21,7 +21,7 @@ class VkontakteScraper(Scraper):
|
||||
return username
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
scraper = VKontakteUserScraper(username)
|
||||
@@ -51,17 +51,6 @@ class VkontakteScraper(Scraper):
|
||||
if post.video:
|
||||
archived_urls[post.video.url] = None
|
||||
|
||||
for url in archived_urls.keys():
|
||||
|
||||
if archive_media:
|
||||
if re.match(VKIE._VALID_URL, url):
|
||||
# Uses regex from yt_dlp to verify VK video URL
|
||||
media_blob, content_type, key = self.ytdlp_url_to_blob(url)
|
||||
else:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="VK",
|
||||
@@ -71,7 +60,7 @@ class VkontakteScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=post.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
media_archived=None)
|
||||
|
||||
@logger.catch
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
|
||||
|
||||
@@ -19,7 +19,7 @@ class YoutubeScraper(Scraper):
|
||||
cookiefilename = 'cookiefile.txt'
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
content_type = 'video/mp4'
|
||||
|
||||
@@ -53,7 +53,7 @@ class YoutubeScraper(Scraper):
|
||||
try:
|
||||
meta = ydl.extract_info(
|
||||
channel.url,
|
||||
download=archive_media)
|
||||
download=False)
|
||||
except yt_dlp.utils.DownloadError as e:
|
||||
raise e
|
||||
else:
|
||||
@@ -67,17 +67,6 @@ class YoutubeScraper(Scraper):
|
||||
archived_urls = {url: None}
|
||||
|
||||
video_id = video["id"]
|
||||
video_ext = video["ext"]
|
||||
|
||||
if archive_media:
|
||||
|
||||
key = f"{video_id}.{video_ext}"
|
||||
|
||||
with open(Path(temp_dir)/key, "rb") as f:
|
||||
media_blob = f.read()
|
||||
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
@@ -88,7 +77,7 @@ class YoutubeScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(video, default = str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
media_archived=None)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Youtube" and channel.url:
|
||||
|
||||
@@ -46,7 +46,7 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
|
||||
scraper = CONTROLLERS[platform]['scraper']
|
||||
controller.register_scraper(scraper = scraper())
|
||||
|
||||
controller.scrape_channels(channels = channels, archive_media = False)
|
||||
controller.scrape_channels(channels = channels)
|
||||
controller.scrape_all_channel_info()
|
||||
controller.archive_unarchived_media_batch()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user